blob: 5f56cf7db0ec1c25ab36e136046ff4db2122dd2b [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
Victor Stinnerce5faf62011-10-05 00:42:43 +020049#ifdef Py_DEBUG
50# define DONT_MAKE_RESULT_READY
51#endif
52
Guido van Rossumd57fd912000-03-10 22:53:23 +000053/* Limit for the Unicode object free list */
54
Christian Heimes2202f872008-02-06 14:31:34 +000055#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000056
57/* Limit for the Unicode object free list stay alive optimization.
58
59 The implementation will keep allocated Unicode memory intact for
60 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000061 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000062
Christian Heimes2202f872008-02-06 14:31:34 +000063 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000065 malloc()-overhead) bytes of unused garbage.
66
67 Setting the limit to 0 effectively turns the feature off.
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069 Note: This is an experimental feature ! If you get core dumps when
70 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000071
72*/
73
Guido van Rossumfd4b9572000-04-10 13:51:10 +000074#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000075
76/* Endianness switches; defaults to little endian */
77
78#ifdef WORDS_BIGENDIAN
79# define BYTEORDER_IS_BIG_ENDIAN
80#else
81# define BYTEORDER_IS_LITTLE_ENDIAN
82#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
/* Object check used by the assertions in this file.  Release builds only
   test the type; debug builds route through _PyUnicode_CheckConsistency()
   with the content check disabled. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200101
/* Field accessors.
 *
 * The underscore-prefixed macros are plain lvalue accessors: they cast the
 * object to the structure that holds the field and perform no checks
 * (except _PyUnicode_KIND and _PyUnicode_GET_LENGTH, which assert
 * _PyUnicode_CHECK first).  The PyUnicode_UTF8* variants additionally
 * assert that the object is ready and handle the compact ASCII case. */

/* UTF-8 buffer pointer stored in the compact structure. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* UTF-8 buffer: for a compact ASCII object this is the memory located
   directly after the PyASCIIObject header, otherwise the utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))

/* UTF-8 length stored in the compact structure. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* UTF-8 length: for a compact ASCII object this is the length field of
   the PyASCIIObject header, otherwise the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

/* wchar_t representation and its length. */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)

/* Unchecked access to the length, state and hash fields. */
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)

/* Checked access to the kind and the length. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* Data pointer stored in the PyUnicodeObject structure. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200136
/* File-local replacement for PyUnicode_READY: asserts _PyUnicode_CHECK(op),
   then evaluates to 0 when the object is already ready and otherwise to
   the result of _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                                             \
    (assert(_PyUnicode_CHECK(op)),                                      \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready((PyObject *)(op))))
Victor Stinner910337b2011-10-03 03:20:16 +0200143
/* Same idea as PyUnicode_READY, but takes a pointer to the object pointer:
   evaluates to 0 when *p_obj is already ready and otherwise to the result
   of _PyUnicode_ReadyReplace(p_obj).

   The argument is parenthesized before it is dereferenced so that an
   expression such as "items + i" is dereferenced as a whole instead of
   being parsed as "*items + i". */
#define _PyUnicode_READY_REPLACE(p_obj)                                 \
    (assert(_PyUnicode_CHECK(*(p_obj))),                                \
     (PyUnicode_IS_READY(*(p_obj)) ?                                    \
      0 : _PyUnicode_ReadyReplace((PyObject **)(p_obj))))
148
/* true if the UTF-8 pointer of the object is the same pointer as its
   character data.  Must not be used on compact ASCII objects (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* true if the wstr pointer of the object is the same pointer as its
   character data.  The expansion only refers to the macro argument "op";
   it must not depend on the name of a variable at the call site. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
156
/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): the object is not compact ASCII, its
   UTF-8 pointer is set, and that pointer differs from the data pointer. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op) &&                \
      _PyUnicode_UTF8(op) &&                            \
      _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data): the wstr pointer is set and either the
   object is not ready or the pointer differs from the data pointer. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
172
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Every argument is evaluated exactly once, before any helper variable
   that could be confused with it is declared.  "to" is parenthesized
   before the cast so that an expression such as "buf + offset" is
   computed in the caller's pointer type and only then converted to
   "to_type *".  The main loop copies four characters per iteration; the
   trailing loop copies the remaining (at most three) characters. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = _end - _iter;                   \
        const from_type *_unrolled_end =                \
            _iter + (_n & ~(Py_ssize_t) 3);             \
        while (_iter < _unrolled_end) {                 \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < _end)                            \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200196
/* The Unicode string has been modified: reset the cached hash to -1. */
#define _PyUnicode_DIRTY(op)                            \
    do {                                                \
        _PyUnicode_HASH(op) = -1;                       \
    } while (0)
199
Walter Dörwald16807132007-05-25 13:52:07 +0000200/* This dictionary holds all interned unicode strings. Note that references
201 to strings in this dictionary are *not* counted in the string's ob_refcnt.
202 When the interned string reaches a refcnt of 0 the string deallocation
203 function will delete the reference from this dictionary.
204
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000207*/
208static PyObject *interned;
209
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000210/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200211static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000212
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200213/* List of static strings. */
214static _Py_Identifier *static_strings;
215
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000216/* Single character Unicode strings in the Latin-1 range are being
217 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200218static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000219
/* Fast detection of the most frequent whitespace characters.

   Lookup table with one entry per 7-bit code point (128 entries, 16 per
   row below).  An entry is 1 for the code points listed in the row
   comments and 0 for everything else. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
250
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200251/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200252static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200253static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200254static void copy_characters(
255 PyObject *to, Py_ssize_t to_start,
256 PyObject *from, Py_ssize_t from_start,
257 Py_ssize_t how_many);
Victor Stinnerc729b8e2011-10-06 02:36:59 +0200258#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200259static int unicode_is_singleton(PyObject *unicode);
Victor Stinnerc729b8e2011-10-06 02:36:59 +0200260#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200261
Alexander Belopolsky40018472011-02-26 01:02:56 +0000262static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200263unicode_fromascii(const unsigned char *s, Py_ssize_t size);
264static PyObject *
265_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
266static PyObject *
267_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
268static PyObject *
269_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
270
271static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000272unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000273 PyObject **errorHandler,const char *encoding, const char *reason,
274 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
275 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
276
Alexander Belopolsky40018472011-02-26 01:02:56 +0000277static void
278raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300279 const char *encoding,
280 const Py_UNICODE *unicode, Py_ssize_t size,
281 Py_ssize_t startpos, Py_ssize_t endpos,
282 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000283
/* Same for linebreaks.

   Lookup table with one entry per 7-bit code point (128 entries).  An
   entry is 1 for the line break characters listed in the comments and 0
   for everything else.  The table is only ever read, so it is declared
   const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
311
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300312/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
313 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000314Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000315PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000316{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000317#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000318 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000319#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000320 /* This is actually an illegal character, so it should
321 not be passed to unichr. */
322 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000323#endif
324}
325
#ifdef Py_DEBUG
/* Debug-only sanity check of the internal layout of a Unicode object.

   Every invariant is verified with assert(), so a violation aborts the
   process; the function itself always returns 1, which allows it to be
   called from inside an assert().

   op:            the object to check (must pass PyUnicode_Check).
   check_content: if non-zero, additionally scan the characters to verify
                  that the narrowest possible kind is used, and require a
                  hash of -1 unless the object is a singleton. */
int
/* FIXME: use PyObject* type for op */
_PyUnicode_CheckConsistency(void *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;
        /* kind whose character width equals sizeof(wchar_t) */
#if SIZEOF_WCHAR_T == 2
        const unsigned int native_wchar_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int native_wchar_kind = PyUnicode_4BYTE_KIND;
#endif

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: characters follow the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else if (kind == PyUnicode_WCHAR_KIND) {
            /* not compact, not ready: only the wstr buffer exists */
            data = ((PyUnicodeObject *)op)->data.any;
            assert(ascii->state.compact == 0);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 0);
            assert(ascii->wstr != NULL);
            assert(data == NULL);
            assert(compact->utf8 == NULL);
            assert(ascii->state.interned == SSTATE_NOT_INTERNED);
        }
        else {
            /* not compact, ready */
            data = ((PyUnicodeObject *)op)->data.any;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.compact == 0);
            assert(ascii->state.ready == 1);
            assert(data != NULL);
            if (ascii->state.ascii) {
                assert (compact->utf8 == data);
                assert (compact->utf8_length == ascii->length);
            }
            else {
                assert (compact->utf8 != data);
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            if (kind == native_wchar_kind) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        /* an absent buffer must have a zero length */
        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t i = 0;
        Py_UCS4 maxchar = 0;
        void *data = PyUnicode_DATA(ascii);

        while (i < ascii->length) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
            i++;
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0)
                assert(maxchar >= 128);
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
        }
        else {
            assert(maxchar >= 0x10000);
        }
    }
    if (check_content && !unicode_is_singleton((PyObject*)ascii)) {
        assert(ascii->hash == -1);
    }
    return 1;
}
#endif
431
Thomas Wouters477c8d52006-05-27 19:21:47 +0000432/* --- Bloom Filters ----------------------------------------------------- */
433
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low-order
   bits of each Unicode character (5 to 7 bits, depending on
   BLOOM_WIDTH) as the bit index. */
437
438/* the linebreak mask is set up by Unicode_Init below */
439
Antoine Pitrouf068f942010-01-13 14:19:12 +0000440#if LONG_BIT >= 128
441#define BLOOM_WIDTH 128
442#elif LONG_BIT >= 64
443#define BLOOM_WIDTH 64
444#elif LONG_BIT >= 32
445#define BLOOM_WIDTH 32
446#else
447#error "LONG_BIT is smaller than 32"
448#endif
449
Thomas Wouters477c8d52006-05-27 19:21:47 +0000450#define BLOOM_MASK unsigned long
451
452static BLOOM_MASK bloom_linebreak;
453
/* BLOOM_ADD sets, and BLOOM tests, the bit of "mask" selected by the
   low-order bits of "ch" (ch & (BLOOM_WIDTH - 1)).  "mask" is
   parenthesized so that a compound expression such as "a | b" is combined
   with the bit as a whole; for BLOOM_ADD it must be a modifiable lvalue. */
#define BLOOM_ADD(mask, ch) (((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
#define BLOOM(mask, ch)     (((mask) &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Line break test: code points below 128 are looked up in the
   ascii_linebreak table; for all others Py_UNICODE_ISLINEBREAK() is only
   called when the bloom_linebreak mask does not rule the character out. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000460
Alexander Belopolsky40018472011-02-26 01:02:56 +0000461Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200462make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000463{
464 /* calculate simple bloom-style bitmask for a given unicode string */
465
Antoine Pitrouf068f942010-01-13 14:19:12 +0000466 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000467 Py_ssize_t i;
468
469 mask = 0;
470 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200471 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000472
473 return mask;
474}
475
/* Membership test of chr in str: the bloom mask is tested first, and
   PyUnicode_FindChar() is only called when the mask does not already
   rule the character out. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr) &&                                                \
     (PyUnicode_FindChar((str), (chr), 0,                               \
                         PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000479
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200480/* Compilation of templated routines */
481
482#include "stringlib/asciilib.h"
483#include "stringlib/fastsearch.h"
484#include "stringlib/partition.h"
485#include "stringlib/split.h"
486#include "stringlib/count.h"
487#include "stringlib/find.h"
488#include "stringlib/find_max_char.h"
489#include "stringlib/localeutil.h"
490#include "stringlib/undef.h"
491
492#include "stringlib/ucs1lib.h"
493#include "stringlib/fastsearch.h"
494#include "stringlib/partition.h"
495#include "stringlib/split.h"
496#include "stringlib/count.h"
497#include "stringlib/find.h"
498#include "stringlib/find_max_char.h"
499#include "stringlib/localeutil.h"
500#include "stringlib/undef.h"
501
502#include "stringlib/ucs2lib.h"
503#include "stringlib/fastsearch.h"
504#include "stringlib/partition.h"
505#include "stringlib/split.h"
506#include "stringlib/count.h"
507#include "stringlib/find.h"
508#include "stringlib/find_max_char.h"
509#include "stringlib/localeutil.h"
510#include "stringlib/undef.h"
511
512#include "stringlib/ucs4lib.h"
513#include "stringlib/fastsearch.h"
514#include "stringlib/partition.h"
515#include "stringlib/split.h"
516#include "stringlib/count.h"
517#include "stringlib/find.h"
518#include "stringlib/find_max_char.h"
519#include "stringlib/localeutil.h"
520#include "stringlib/undef.h"
521
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200522#include "stringlib/unicodedefs.h"
523#include "stringlib/fastsearch.h"
524#include "stringlib/count.h"
525#include "stringlib/find.h"
526
Guido van Rossumd57fd912000-03-10 22:53:23 +0000527/* --- Unicode Object ----------------------------------------------------- */
528
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200529static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200530fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200531
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200532Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
533 Py_ssize_t size, Py_UCS4 ch,
534 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200535{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200536 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
537
538 switch (kind) {
539 case PyUnicode_1BYTE_KIND:
540 {
541 Py_UCS1 ch1 = (Py_UCS1) ch;
542 if (ch1 == ch)
543 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
544 else
545 return -1;
546 }
547 case PyUnicode_2BYTE_KIND:
548 {
549 Py_UCS2 ch2 = (Py_UCS2) ch;
550 if (ch2 == ch)
551 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
552 else
553 return -1;
554 }
555 case PyUnicode_4BYTE_KIND:
556 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
557 default:
558 assert(0);
559 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200560 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200561}
562
Victor Stinnerfe226c02011-10-03 03:52:20 +0200563static PyObject*
564resize_compact(PyObject *unicode, Py_ssize_t length)
565{
566 Py_ssize_t char_size;
567 Py_ssize_t struct_size;
568 Py_ssize_t new_size;
569 int share_wstr;
570
571 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200572 char_size = PyUnicode_KIND(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200573 if (PyUnicode_IS_COMPACT_ASCII(unicode))
574 struct_size = sizeof(PyASCIIObject);
575 else
576 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200577 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200578
579 _Py_DEC_REFTOTAL;
580 _Py_ForgetReference(unicode);
581
582 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
583 PyErr_NoMemory();
584 return NULL;
585 }
586 new_size = (struct_size + (length + 1) * char_size);
587
588 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
589 if (unicode == NULL) {
590 PyObject_Del(unicode);
591 PyErr_NoMemory();
592 return NULL;
593 }
594 _Py_NewReference(unicode);
595 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200596 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200597 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200598 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
599 _PyUnicode_WSTR_LENGTH(unicode) = length;
600 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200601 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
602 length, 0);
603 return unicode;
604}
605
Alexander Belopolsky40018472011-02-26 01:02:56 +0000606static int
Victor Stinner95663112011-10-04 01:03:50 +0200607resize_inplace(PyUnicodeObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000608{
Victor Stinner95663112011-10-04 01:03:50 +0200609 wchar_t *wstr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200610 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200611 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000612
Victor Stinner95663112011-10-04 01:03:50 +0200613 _PyUnicode_DIRTY(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200614
615 if (PyUnicode_IS_READY(unicode)) {
616 Py_ssize_t char_size;
617 Py_ssize_t new_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200618 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200619 void *data;
620
621 data = _PyUnicode_DATA_ANY(unicode);
622 assert(data != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200623 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200624 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
625 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinner95663112011-10-04 01:03:50 +0200626 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
627 {
628 PyObject_DEL(_PyUnicode_UTF8(unicode));
629 _PyUnicode_UTF8(unicode) = NULL;
630 _PyUnicode_UTF8_LENGTH(unicode) = 0;
631 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200632
633 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
634 PyErr_NoMemory();
635 return -1;
636 }
637 new_size = (length + 1) * char_size;
638
639 data = (PyObject *)PyObject_REALLOC(data, new_size);
640 if (data == NULL) {
641 PyErr_NoMemory();
642 return -1;
643 }
644 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200645 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200646 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200647 _PyUnicode_WSTR_LENGTH(unicode) = length;
648 }
649 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200650 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200651 _PyUnicode_UTF8_LENGTH(unicode) = length;
652 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200653 _PyUnicode_LENGTH(unicode) = length;
654 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200655 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200656 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200657 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200658 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200659 }
Victor Stinner95663112011-10-04 01:03:50 +0200660 assert(_PyUnicode_WSTR(unicode) != NULL);
661
662 /* check for integer overflow */
663 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
664 PyErr_NoMemory();
665 return -1;
666 }
667 wstr = _PyUnicode_WSTR(unicode);
668 wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
669 if (!wstr) {
670 PyErr_NoMemory();
671 return -1;
672 }
673 _PyUnicode_WSTR(unicode) = wstr;
674 _PyUnicode_WSTR(unicode)[length] = 0;
675 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200676 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000677 return 0;
678}
679
Victor Stinnerfe226c02011-10-03 03:52:20 +0200680static PyObject*
681resize_copy(PyObject *unicode, Py_ssize_t length)
682{
683 Py_ssize_t copy_length;
684 if (PyUnicode_IS_COMPACT(unicode)) {
685 PyObject *copy;
686 assert(PyUnicode_IS_READY(unicode));
687
688 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
689 if (copy == NULL)
690 return NULL;
691
692 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200693 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200694 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200695 }
696 else {
Victor Stinner2fd82272011-10-03 04:06:05 +0200697 PyUnicodeObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200698 assert(_PyUnicode_WSTR(unicode) != NULL);
699 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner2fd82272011-10-03 04:06:05 +0200700 w = _PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200701 if (w == NULL)
702 return NULL;
703 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
704 copy_length = Py_MIN(copy_length, length);
705 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
706 copy_length);
707 return (PyObject*)w;
708 }
709}
710
Guido van Rossumd57fd912000-03-10 22:53:23 +0000711/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000712 Ux0000 terminated; some code (e.g. new_identifier)
713 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000714
715 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000716 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000717
718*/
719
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200720#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200721static int unicode_old_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200722#endif
723
Alexander Belopolsky40018472011-02-26 01:02:56 +0000724static PyUnicodeObject *
725_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000726{
727 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200728 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000729
Thomas Wouters477c8d52006-05-27 19:21:47 +0000730 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000731 if (length == 0 && unicode_empty != NULL) {
732 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200733 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000734 }
735
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000736 /* Ensure we won't overflow the size. */
737 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
738 return (PyUnicodeObject *)PyErr_NoMemory();
739 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200740 if (length < 0) {
741 PyErr_SetString(PyExc_SystemError,
742 "Negative size passed to _PyUnicode_New");
743 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000744 }
745
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200746#ifdef Py_DEBUG
747 ++unicode_old_new_calls;
748#endif
749
750 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
751 if (unicode == NULL)
752 return NULL;
753 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
754 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
755 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000756 PyErr_NoMemory();
757 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000758 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200759
Jeremy Hyltond8082792003-09-16 19:41:39 +0000760 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000761 * the caller fails before initializing str -- unicode_resize()
762 * reads str[0], and the Keep-Alive optimization can keep memory
763 * allocated for str alive across a call to unicode_dealloc(unicode).
764 * We don't want unicode_resize to read uninitialized memory in
765 * that case.
766 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200767 _PyUnicode_WSTR(unicode)[0] = 0;
768 _PyUnicode_WSTR(unicode)[length] = 0;
769 _PyUnicode_WSTR_LENGTH(unicode) = length;
770 _PyUnicode_HASH(unicode) = -1;
771 _PyUnicode_STATE(unicode).interned = 0;
772 _PyUnicode_STATE(unicode).kind = 0;
773 _PyUnicode_STATE(unicode).compact = 0;
774 _PyUnicode_STATE(unicode).ready = 0;
775 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200776 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200777 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200778 _PyUnicode_UTF8(unicode) = NULL;
779 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000780 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000781
Benjamin Peterson29060642009-01-31 22:14:21 +0000782 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000783 /* XXX UNREF/NEWREF interface should be more symmetrical */
784 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000785 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000786 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000787 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000788}
789
Victor Stinnerf42dc442011-10-02 23:33:16 +0200790static const char*
791unicode_kind_name(PyObject *unicode)
792{
Victor Stinner42dfd712011-10-03 14:41:45 +0200793 /* don't check consistency: unicode_kind_name() is called from
794 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200795 if (!PyUnicode_IS_COMPACT(unicode))
796 {
797 if (!PyUnicode_IS_READY(unicode))
798 return "wstr";
799 switch(PyUnicode_KIND(unicode))
800 {
801 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200802 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200803 return "legacy ascii";
804 else
805 return "legacy latin1";
806 case PyUnicode_2BYTE_KIND:
807 return "legacy UCS2";
808 case PyUnicode_4BYTE_KIND:
809 return "legacy UCS4";
810 default:
811 return "<legacy invalid kind>";
812 }
813 }
814 assert(PyUnicode_IS_READY(unicode));
815 switch(PyUnicode_KIND(unicode))
816 {
817 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200818 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200819 return "ascii";
820 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200821 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200822 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200823 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200824 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200825 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200826 default:
827 return "<invalid compact kind>";
828 }
829}
830
#ifdef Py_DEBUG
static int unicode_new_new_calls = 0;

/* Functions wrapping macros for use in debugger */

/* Function form of the PyUnicode_UTF8() macro. */
char *_PyUnicode_utf8(void *unicode)
{
    return PyUnicode_UTF8(unicode);
}

/* Function form of the _PyUnicode_COMPACT_DATA() macro. */
void *_PyUnicode_compact_data(void *unicode)
{
    return _PyUnicode_COMPACT_DATA(unicode);
}

/* Print the object address, its compact flags and the candidate data
   addresses to stdout, then return PyUnicode_DATA(unicode). */
void *_PyUnicode_data(void *unicode)
{
    void *after_ascii = (void*)((PyASCIIObject*)(unicode) + 1);
    void *after_compact = (void*)((PyCompactUnicodeObject*)(unicode) + 1);

    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", after_ascii);
    printf("compact op %p\n", after_compact);
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}

/* Print a one-line description of the string `op` to stdout: kind name,
   length, the wstr pointer, the utf8 pointer (unless the string is compact
   ASCII) and the data pointer.  A pointer equal to the data pointer is
   prefixed with "shared ". */
void
_PyUnicode_Dump(PyObject *op)
{
    PyASCIIObject *as_ascii = (PyASCIIObject *)op;
    PyCompactUnicodeObject *as_compact = (PyCompactUnicodeObject *)op;
    PyUnicodeObject *as_legacy = (PyUnicodeObject *)op;
    void *data;

    printf("%s: len=%zu, ",unicode_kind_name(op), as_ascii->length);

    /* compact: data is taken right after the compact header;
       otherwise it comes from the data pointer of the object */
    if (as_ascii->state.compact)
        data = (as_compact + 1);
    else
        data = as_legacy->data.any;

    if (as_ascii->wstr == data)
        printf("shared ");
    printf("wstr=%p", as_ascii->wstr);

    /* the wstr length and utf8 fields are skipped for compact ASCII */
    if (as_ascii->state.ascii != 1 || as_ascii->state.compact != 1) {
        printf(" (%zu), ", as_compact->wstr_length);
        if (!as_ascii->state.compact
            && as_compact->utf8 == as_legacy->data.any)
            printf("shared ");
        printf("utf8=%p (%zu)", as_compact->utf8, as_compact->utf8_length);
    }
    printf(", data=%p\n", data);
}
#endif
876
877PyObject *
878PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
879{
880 PyObject *obj;
881 PyCompactUnicodeObject *unicode;
882 void *data;
883 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200884 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200885 Py_ssize_t char_size;
886 Py_ssize_t struct_size;
887
888 /* Optimization for empty strings */
889 if (size == 0 && unicode_empty != NULL) {
890 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200891 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200892 }
893
894#ifdef Py_DEBUG
895 ++unicode_new_new_calls;
896#endif
897
Victor Stinner9e9d6892011-10-04 01:02:02 +0200898 is_ascii = 0;
899 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200900 struct_size = sizeof(PyCompactUnicodeObject);
901 if (maxchar < 128) {
902 kind_state = PyUnicode_1BYTE_KIND;
903 char_size = 1;
904 is_ascii = 1;
905 struct_size = sizeof(PyASCIIObject);
906 }
907 else if (maxchar < 256) {
908 kind_state = PyUnicode_1BYTE_KIND;
909 char_size = 1;
910 }
911 else if (maxchar < 65536) {
912 kind_state = PyUnicode_2BYTE_KIND;
913 char_size = 2;
914 if (sizeof(wchar_t) == 2)
915 is_sharing = 1;
916 }
917 else {
918 kind_state = PyUnicode_4BYTE_KIND;
919 char_size = 4;
920 if (sizeof(wchar_t) == 4)
921 is_sharing = 1;
922 }
923
924 /* Ensure we won't overflow the size. */
925 if (size < 0) {
926 PyErr_SetString(PyExc_SystemError,
927 "Negative size passed to PyUnicode_New");
928 return NULL;
929 }
930 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
931 return PyErr_NoMemory();
932
933 /* Duplicated allocation code from _PyObject_New() instead of a call to
934 * PyObject_New() so we are able to allocate space for the object and
935 * it's data buffer.
936 */
937 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
938 if (obj == NULL)
939 return PyErr_NoMemory();
940 obj = PyObject_INIT(obj, &PyUnicode_Type);
941 if (obj == NULL)
942 return NULL;
943
944 unicode = (PyCompactUnicodeObject *)obj;
945 if (is_ascii)
946 data = ((PyASCIIObject*)obj) + 1;
947 else
948 data = unicode + 1;
949 _PyUnicode_LENGTH(unicode) = size;
950 _PyUnicode_HASH(unicode) = -1;
951 _PyUnicode_STATE(unicode).interned = 0;
952 _PyUnicode_STATE(unicode).kind = kind_state;
953 _PyUnicode_STATE(unicode).compact = 1;
954 _PyUnicode_STATE(unicode).ready = 1;
955 _PyUnicode_STATE(unicode).ascii = is_ascii;
956 if (is_ascii) {
957 ((char*)data)[size] = 0;
958 _PyUnicode_WSTR(unicode) = NULL;
959 }
960 else if (kind_state == PyUnicode_1BYTE_KIND) {
961 ((char*)data)[size] = 0;
962 _PyUnicode_WSTR(unicode) = NULL;
963 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200964 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200965 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200966 }
967 else {
968 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200969 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200970 if (kind_state == PyUnicode_2BYTE_KIND)
971 ((Py_UCS2*)data)[size] = 0;
972 else /* kind_state == PyUnicode_4BYTE_KIND */
973 ((Py_UCS4*)data)[size] = 0;
974 if (is_sharing) {
975 _PyUnicode_WSTR_LENGTH(unicode) = size;
976 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
977 }
978 else {
979 _PyUnicode_WSTR_LENGTH(unicode) = 0;
980 _PyUnicode_WSTR(unicode) = NULL;
981 }
982 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200983 assert(_PyUnicode_CheckConsistency(unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200984 return obj;
985}
986
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
   will decode surrogate pairs, the other conversions are implemented as macros
   for efficiency.

   The characters in [begin, end) are written to the 4-byte data buffer of
   `unicode`.  A high surrogate followed by a low surrogate becomes one code
   point; any other unit, including a lone surrogate, is copied as is.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *in = begin;
    Py_UCS4 *out;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    out = PyUnicode_4BYTE_DATA(unicode);

    while (in < end) {
        assert(out < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (in[0] >= 0xD800 && in[0] <= 0xDBFF
            && (in+1) < end && in[1] >= 0xDC00 && in[1] <= 0xDFFF)
        {
            /* surrogate pair: two input units, one output code point */
            *out++ = (((in[0] & 0x3FF)<<10) | (in[1] & 0x3FF)) + 0x10000;
            in += 2;
        }
        else {
            *out++ = *in++;
        }
    }
    /* the output must have been filled exactly */
    assert(out == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));

}
#endif
1025
Victor Stinnercd9950f2011-10-02 00:34:53 +02001026static int
1027_PyUnicode_Dirty(PyObject *unicode)
1028{
Victor Stinner910337b2011-10-03 03:20:16 +02001029 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +02001030 if (Py_REFCNT(unicode) != 1) {
Victor Stinner01698042011-10-04 00:04:26 +02001031 PyErr_SetString(PyExc_SystemError,
Victor Stinnercd9950f2011-10-02 00:34:53 +02001032 "Cannot modify a string having more than 1 reference");
1033 return -1;
1034 }
1035 _PyUnicode_DIRTY(unicode);
1036 return 0;
1037}
1038
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001039static int
1040_copy_characters(PyObject *to, Py_ssize_t to_start,
1041 PyObject *from, Py_ssize_t from_start,
1042 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001043{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001044 unsigned int from_kind, to_kind;
1045 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001046 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001047
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001048 assert(PyUnicode_Check(from));
1049 assert(PyUnicode_Check(to));
1050 assert(PyUnicode_IS_READY(from));
1051 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001052
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001053 assert(PyUnicode_GET_LENGTH(from) >= how_many);
1054 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1055 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001056
Victor Stinnerf5ca1a22011-09-28 23:54:59 +02001057 if (how_many == 0)
1058 return 0;
1059
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001060 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001061 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001062 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001063 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001064
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001065#ifdef Py_DEBUG
1066 if (!check_maxchar
1067 && (from_kind > to_kind
1068 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001069 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001070 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1071 Py_UCS4 ch;
1072 Py_ssize_t i;
1073 for (i=0; i < how_many; i++) {
1074 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1075 assert(ch <= to_maxchar);
1076 }
1077 }
1078#endif
1079 fast = (from_kind == to_kind);
1080 if (check_maxchar
1081 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1082 {
1083 /* deny latin1 => ascii */
1084 fast = 0;
1085 }
1086
1087 if (fast) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001088 Py_MEMCPY((char*)to_data + to_kind * to_start,
1089 (char*)from_data + from_kind * from_start,
1090 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001091 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001092 else if (from_kind == PyUnicode_1BYTE_KIND
1093 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001094 {
1095 _PyUnicode_CONVERT_BYTES(
1096 Py_UCS1, Py_UCS2,
1097 PyUnicode_1BYTE_DATA(from) + from_start,
1098 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1099 PyUnicode_2BYTE_DATA(to) + to_start
1100 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001101 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001102 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001103 && to_kind == PyUnicode_4BYTE_KIND)
1104 {
1105 _PyUnicode_CONVERT_BYTES(
1106 Py_UCS1, Py_UCS4,
1107 PyUnicode_1BYTE_DATA(from) + from_start,
1108 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1109 PyUnicode_4BYTE_DATA(to) + to_start
1110 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001111 }
1112 else if (from_kind == PyUnicode_2BYTE_KIND
1113 && to_kind == PyUnicode_4BYTE_KIND)
1114 {
1115 _PyUnicode_CONVERT_BYTES(
1116 Py_UCS2, Py_UCS4,
1117 PyUnicode_2BYTE_DATA(from) + from_start,
1118 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1119 PyUnicode_4BYTE_DATA(to) + to_start
1120 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001121 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001122 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001123 /* check if max_char(from substring) <= max_char(to) */
1124 if (from_kind > to_kind
1125 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001126 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001127 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001128 /* slow path to check for character overflow */
1129 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001130 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001131 Py_ssize_t i;
1132
Victor Stinner56c161a2011-10-06 02:47:11 +02001133#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001134 for (i=0; i < how_many; i++) {
1135 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001136 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001137 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1138 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001139#else
1140 if (!check_maxchar) {
1141 for (i=0; i < how_many; i++) {
1142 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1143 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1144 }
1145 }
1146 else {
1147 for (i=0; i < how_many; i++) {
1148 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1149 if (ch > to_maxchar)
1150 return 1;
1151 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1152 }
1153 }
1154#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001155 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001156 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001157 assert(0 && "inconsistent state");
1158 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001159 }
1160 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001161 return 0;
1162}
1163
1164static void
1165copy_characters(PyObject *to, Py_ssize_t to_start,
1166 PyObject *from, Py_ssize_t from_start,
1167 Py_ssize_t how_many)
1168{
1169 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1170}
1171
1172Py_ssize_t
1173PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1174 PyObject *from, Py_ssize_t from_start,
1175 Py_ssize_t how_many)
1176{
1177 int err;
1178
1179 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1180 PyErr_BadInternalCall();
1181 return -1;
1182 }
1183
1184 if (PyUnicode_READY(from))
1185 return -1;
1186 if (PyUnicode_READY(to))
1187 return -1;
1188
1189 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1190 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1191 PyErr_Format(PyExc_SystemError,
1192 "Cannot write %zi characters at %zi "
1193 "in a string of %zi characters",
1194 how_many, to_start, PyUnicode_GET_LENGTH(to));
1195 return -1;
1196 }
1197
1198 if (how_many == 0)
1199 return 0;
1200
1201 if (_PyUnicode_Dirty(to))
1202 return -1;
1203
1204 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1205 if (err) {
1206 PyErr_Format(PyExc_SystemError,
1207 "Cannot copy %s characters "
1208 "into a string of %s characters",
1209 unicode_kind_name(from),
1210 unicode_kind_name(to));
1211 return -1;
1212 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001213 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001214}
1215
Victor Stinner17222162011-09-28 22:15:37 +02001216/* Find the maximum code point and count the number of surrogate pairs so a
1217 correct string length can be computed before converting a string to UCS4.
1218 This function counts single surrogates as a character and not as a pair.
1219
1220 Return 0 on success, or -1 on error. */
1221static int
1222find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1223 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001224{
1225 const wchar_t *iter;
1226
Victor Stinnerc53be962011-10-02 21:33:54 +02001227 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001228 *num_surrogates = 0;
1229 *maxchar = 0;
1230
1231 for (iter = begin; iter < end; ) {
Victor Stinnerae864852011-10-05 14:02:44 +02001232 if (*iter > *maxchar) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001233 *maxchar = *iter;
Victor Stinnerae864852011-10-05 14:02:44 +02001234#if SIZEOF_WCHAR_T != 2
1235 if (*maxchar >= 0x10000)
1236 return 0;
1237#endif
1238 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001239#if SIZEOF_WCHAR_T == 2
1240 if (*iter >= 0xD800 && *iter <= 0xDBFF
1241 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1242 {
1243 Py_UCS4 surrogate_val;
1244 surrogate_val = (((iter[0] & 0x3FF)<<10)
1245 | (iter[1] & 0x3FF)) + 0x10000;
1246 ++(*num_surrogates);
1247 if (surrogate_val > *maxchar)
1248 *maxchar = surrogate_val;
1249 iter += 2;
1250 }
1251 else
1252 iter++;
1253#else
1254 iter++;
1255#endif
1256 }
1257 return 0;
1258}
1259
1260#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001261static int unicode_ready_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001262#endif
1263
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001264static int
1265unicode_ready(PyObject **p_obj, int replace)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001266{
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001267 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001268 wchar_t *end;
1269 Py_UCS4 maxchar = 0;
1270 Py_ssize_t num_surrogates;
1271#if SIZEOF_WCHAR_T == 2
1272 Py_ssize_t length_wo_surrogates;
1273#endif
1274
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001275 assert(p_obj != NULL);
1276 unicode = (PyUnicodeObject *)*p_obj;
1277
Georg Brandl7597add2011-10-05 16:36:47 +02001278 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001279 strings were created using _PyObject_New() and where no canonical
1280 representation (the str field) has been set yet aka strings
1281 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001282 assert(_PyUnicode_CHECK(unicode));
1283 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001284 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001285 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001286 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001287 /* Actually, it should neither be interned nor be anything else: */
1288 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001289
1290#ifdef Py_DEBUG
1291 ++unicode_ready_calls;
1292#endif
1293
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001294#ifdef Py_DEBUG
1295 assert(!replace || Py_REFCNT(unicode) == 1);
1296#else
1297 if (replace && Py_REFCNT(unicode) != 1)
1298 replace = 0;
1299#endif
1300 if (replace) {
1301 Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode);
1302 wchar_t *wstr = _PyUnicode_WSTR(unicode);
1303 /* Optimization for empty strings */
1304 if (len == 0) {
1305 Py_INCREF(unicode_empty);
1306 Py_DECREF(*p_obj);
1307 *p_obj = unicode_empty;
1308 return 0;
1309 }
1310 if (len == 1 && wstr[0] < 256) {
1311 PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]);
1312 if (latin1_char == NULL)
1313 return -1;
1314 Py_DECREF(*p_obj);
1315 *p_obj = latin1_char;
1316 return 0;
1317 }
1318 }
1319
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001321 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001322 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001323 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001324
1325 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001326 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1327 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001328 PyErr_NoMemory();
1329 return -1;
1330 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001331 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001332 _PyUnicode_WSTR(unicode), end,
1333 PyUnicode_1BYTE_DATA(unicode));
1334 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1335 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1336 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1337 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001338 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001339 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001340 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001341 }
1342 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001343 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001344 _PyUnicode_UTF8(unicode) = NULL;
1345 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001346 }
1347 PyObject_FREE(_PyUnicode_WSTR(unicode));
1348 _PyUnicode_WSTR(unicode) = NULL;
1349 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1350 }
1351 /* In this case we might have to convert down from 4-byte native
1352 wchar_t to 2-byte unicode. */
1353 else if (maxchar < 65536) {
1354 assert(num_surrogates == 0 &&
1355 "FindMaxCharAndNumSurrogatePairs() messed up");
1356
Victor Stinner506f5922011-09-28 22:34:18 +02001357#if SIZEOF_WCHAR_T == 2
1358 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001359 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001360 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1361 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1362 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001363 _PyUnicode_UTF8(unicode) = NULL;
1364 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001365#else
1366 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001367 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001368 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001369 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001370 PyErr_NoMemory();
1371 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001372 }
Victor Stinner506f5922011-09-28 22:34:18 +02001373 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1374 _PyUnicode_WSTR(unicode), end,
1375 PyUnicode_2BYTE_DATA(unicode));
1376 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1377 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1378 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001379 _PyUnicode_UTF8(unicode) = NULL;
1380 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001381 PyObject_FREE(_PyUnicode_WSTR(unicode));
1382 _PyUnicode_WSTR(unicode) = NULL;
1383 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1384#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001385 }
1386 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1387 else {
1388#if SIZEOF_WCHAR_T == 2
1389 /* in case the native representation is 2-bytes, we need to allocate a
1390 new normalized 4-byte version. */
1391 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001392 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1393 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001394 PyErr_NoMemory();
1395 return -1;
1396 }
1397 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1398 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001399 _PyUnicode_UTF8(unicode) = NULL;
1400 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001401 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1402 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001403 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001404 PyObject_FREE(_PyUnicode_WSTR(unicode));
1405 _PyUnicode_WSTR(unicode) = NULL;
1406 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1407#else
1408 assert(num_surrogates == 0);
1409
Victor Stinnerc3c74152011-10-02 20:39:55 +02001410 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001411 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001412 _PyUnicode_UTF8(unicode) = NULL;
1413 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001414 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1415#endif
1416 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1417 }
1418 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001419 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001420 return 0;
1421}
1422
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001423int
1424_PyUnicode_ReadyReplace(PyObject **op)
1425{
1426 return unicode_ready(op, 1);
1427}
1428
1429int
1430_PyUnicode_Ready(PyObject *op)
1431{
1432 return unicode_ready(&op, 0);
1433}
1434
Alexander Belopolsky40018472011-02-26 01:02:56 +00001435static void
1436unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001437{
Walter Dörwald16807132007-05-25 13:52:07 +00001438 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001439 case SSTATE_NOT_INTERNED:
1440 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001441
Benjamin Peterson29060642009-01-31 22:14:21 +00001442 case SSTATE_INTERNED_MORTAL:
1443 /* revive dead object temporarily for DelItem */
1444 Py_REFCNT(unicode) = 3;
1445 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
1446 Py_FatalError(
1447 "deletion of interned string failed");
1448 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001449
Benjamin Peterson29060642009-01-31 22:14:21 +00001450 case SSTATE_INTERNED_IMMORTAL:
1451 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001452
Benjamin Peterson29060642009-01-31 22:14:21 +00001453 default:
1454 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001455 }
1456
Victor Stinner03490912011-10-03 23:45:12 +02001457 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001458 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001459 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001460 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001461
1462 if (PyUnicode_IS_COMPACT(unicode)) {
1463 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001464 }
1465 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001466 if (_PyUnicode_DATA_ANY(unicode))
1467 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Benjamin Peterson29060642009-01-31 22:14:21 +00001468 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001469 }
1470}
1471
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if `unicode` is the shared empty string
   (unicode_empty) or the object cached in unicode_latin1[] for its single
   character, and 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a one-character string that is not of the wchar kind can be a
       cached single-character object. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    if (ch < 256 && unicode_latin1[ch] == unicode)
        return 1;
    return 0;
}
#endif
1488
Alexander Belopolsky40018472011-02-26 01:02:56 +00001489static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001490unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001491{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001492 if (Py_REFCNT(unicode) != 1)
1493 return 0;
1494 if (PyUnicode_CHECK_INTERNED(unicode))
1495 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001496#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001497 /* singleton refcount is greater than 1 */
1498 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001499#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001500 return 1;
1501}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001502
Victor Stinnerfe226c02011-10-03 03:52:20 +02001503static int
1504unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1505{
1506 PyObject *unicode;
1507 Py_ssize_t old_length;
1508
1509 assert(p_unicode != NULL);
1510 unicode = *p_unicode;
1511
1512 assert(unicode != NULL);
1513 assert(PyUnicode_Check(unicode));
1514 assert(0 <= length);
1515
Victor Stinner910337b2011-10-03 03:20:16 +02001516 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001517 old_length = PyUnicode_WSTR_LENGTH(unicode);
1518 else
1519 old_length = PyUnicode_GET_LENGTH(unicode);
1520 if (old_length == length)
1521 return 0;
1522
Victor Stinnerfe226c02011-10-03 03:52:20 +02001523 if (!unicode_resizable(unicode)) {
1524 PyObject *copy = resize_copy(unicode, length);
1525 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001526 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001527 Py_DECREF(*p_unicode);
1528 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001529 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001530 }
1531
Victor Stinnerfe226c02011-10-03 03:52:20 +02001532 if (PyUnicode_IS_COMPACT(unicode)) {
1533 *p_unicode = resize_compact(unicode, length);
1534 if (*p_unicode == NULL)
1535 return -1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001536 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001537 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001538 }
1539 return resize_inplace((PyUnicodeObject*)unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001540}
1541
Alexander Belopolsky40018472011-02-26 01:02:56 +00001542int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001543PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001544{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001545 PyObject *unicode;
1546 if (p_unicode == NULL) {
1547 PyErr_BadInternalCall();
1548 return -1;
1549 }
1550 unicode = *p_unicode;
1551 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0
1552 || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND)
1553 {
1554 PyErr_BadInternalCall();
1555 return -1;
1556 }
1557 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001558}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001559
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001560static PyObject*
1561get_latin1_char(unsigned char ch)
1562{
Victor Stinnera464fc12011-10-02 20:39:30 +02001563 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001564 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001565 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001566 if (!unicode)
1567 return NULL;
1568 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001569 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001570 unicode_latin1[ch] = unicode;
1571 }
1572 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001573 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001574}
1575
Alexander Belopolsky40018472011-02-26 01:02:56 +00001576PyObject *
1577PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001578{
1579 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001580 Py_UCS4 maxchar = 0;
1581 Py_ssize_t num_surrogates;
1582
1583 if (u == NULL)
1584 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001585
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001586 /* If the Unicode data is known at construction time, we can apply
1587 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001588
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001589 /* Optimization for empty strings */
1590 if (size == 0 && unicode_empty != NULL) {
1591 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001592 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001593 }
Tim Petersced69f82003-09-16 20:30:58 +00001594
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001595 /* Single character Unicode objects in the Latin-1 range are
1596 shared when using this constructor */
1597 if (size == 1 && *u < 256)
1598 return get_latin1_char((unsigned char)*u);
1599
1600 /* If not empty and not single character, copy the Unicode data
1601 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001602 if (find_maxchar_surrogates(u, u + size,
1603 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001604 return NULL;
1605
1606 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1607 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001608 if (!unicode)
1609 return NULL;
1610
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001611 switch (PyUnicode_KIND(unicode)) {
1612 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001613 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001614 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1615 break;
1616 case PyUnicode_2BYTE_KIND:
1617#if Py_UNICODE_SIZE == 2
1618 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1619#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001620 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001621 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1622#endif
1623 break;
1624 case PyUnicode_4BYTE_KIND:
1625#if SIZEOF_WCHAR_T == 2
1626 /* This is the only case which has to process surrogates, thus
1627 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001628 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001629#else
1630 assert(num_surrogates == 0);
1631 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1632#endif
1633 break;
1634 default:
1635 assert(0 && "Impossible state");
1636 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001637
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001638 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001639 return (PyObject *)unicode;
1640}
1641
Alexander Belopolsky40018472011-02-26 01:02:56 +00001642PyObject *
1643PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001644{
1645 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001646
Benjamin Peterson14339b62009-01-31 16:36:08 +00001647 if (size < 0) {
1648 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001649 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001650 return NULL;
1651 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001652
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001653 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001654 some optimizations which share commonly used objects.
1655 Also, this means the input must be UTF-8, so fall back to the
1656 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001657 if (u != NULL) {
1658
Benjamin Peterson29060642009-01-31 22:14:21 +00001659 /* Optimization for empty strings */
1660 if (size == 0 && unicode_empty != NULL) {
1661 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001662 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001663 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001664
1665 /* Single characters are shared when using this constructor.
1666 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001667 if (size == 1 && Py_CHARMASK(*u) < 128)
1668 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001669
1670 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001671 }
1672
Walter Dörwald55507312007-05-18 13:12:10 +00001673 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001674 if (!unicode)
1675 return NULL;
1676
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001677 return (PyObject *)unicode;
1678}
1679
Alexander Belopolsky40018472011-02-26 01:02:56 +00001680PyObject *
1681PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001682{
1683 size_t size = strlen(u);
1684 if (size > PY_SSIZE_T_MAX) {
1685 PyErr_SetString(PyExc_OverflowError, "input too long");
1686 return NULL;
1687 }
1688
1689 return PyUnicode_FromStringAndSize(u, size);
1690}
1691
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001692PyObject *
1693_PyUnicode_FromId(_Py_Identifier *id)
1694{
1695 if (!id->object) {
1696 id->object = PyUnicode_FromString(id->string);
1697 if (!id->object)
1698 return NULL;
1699 PyUnicode_InternInPlace(&id->object);
1700 assert(!id->next);
1701 id->next = static_strings;
1702 static_strings = id;
1703 }
1704 Py_INCREF(id->object);
1705 return id->object;
1706}
1707
1708void
1709_PyUnicode_ClearStaticStrings()
1710{
1711 _Py_Identifier *i;
1712 for (i = static_strings; i; i = i->next) {
1713 Py_DECREF(i->object);
1714 i->object = NULL;
1715 i->next = NULL;
1716 }
1717}
1718
Victor Stinnere57b1c02011-09-28 22:20:48 +02001719static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001720unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001721{
Victor Stinner0617b6e2011-10-05 23:26:01 +02001722 PyObject *res;
1723#ifdef Py_DEBUG
1724 const unsigned char *p;
1725 const unsigned char *end = s + size;
1726 for (p=s; p < end; p++) {
1727 assert(*p < 128);
1728 }
1729#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001730 if (size == 1)
1731 return get_latin1_char(s[0]);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001732 res = PyUnicode_New(size, 127);
Victor Stinner702c7342011-10-05 13:50:52 +02001733 if (!res)
1734 return NULL;
Victor Stinner0617b6e2011-10-05 23:26:01 +02001735 memcpy(PyUnicode_1BYTE_DATA(res), s, size);
Victor Stinner702c7342011-10-05 13:50:52 +02001736 return res;
1737}
1738
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001739static Py_UCS4
1740kind_maxchar_limit(unsigned int kind)
1741{
1742 switch(kind) {
1743 case PyUnicode_1BYTE_KIND:
1744 return 0x80;
1745 case PyUnicode_2BYTE_KIND:
1746 return 0x100;
1747 case PyUnicode_4BYTE_KIND:
1748 return 0x10000;
1749 default:
1750 assert(0 && "invalid kind");
1751 return 0x10ffff;
1752 }
1753}
1754
Victor Stinner702c7342011-10-05 13:50:52 +02001755static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001756_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001757{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001758 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001759 unsigned char max_char = 127;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001760
1761 assert(size >= 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001762 if (size == 1)
1763 return get_latin1_char(u[0]);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001764 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001765 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001766 if (!res)
1767 return NULL;
1768 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001769 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001770 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001771}
1772
Victor Stinnere57b1c02011-09-28 22:20:48 +02001773static PyObject*
1774_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001775{
1776 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001777 Py_UCS2 max_char = 0;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001778
1779 assert(size >= 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001780 if (size == 1 && u[0] < 256)
Victor Stinner4e101002011-10-11 23:27:52 +02001781 return get_latin1_char((unsigned char)u[0]);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001782 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001783 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001784 if (!res)
1785 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001786 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001787 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001788 else {
1789 _PyUnicode_CONVERT_BYTES(
1790 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1791 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001792 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001793 return res;
1794}
1795
Victor Stinnere57b1c02011-09-28 22:20:48 +02001796static PyObject*
1797_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001798{
1799 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001800 Py_UCS4 max_char = 0;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001801
1802 assert(size >= 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001803 if (size == 1 && u[0] < 256)
1804 return get_latin1_char(u[0]);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001805 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001806 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001807 if (!res)
1808 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001809 if (max_char < 256)
1810 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1811 PyUnicode_1BYTE_DATA(res));
1812 else if (max_char < 0x10000)
1813 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1814 PyUnicode_2BYTE_DATA(res));
1815 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001816 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001817 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001818 return res;
1819}
1820
1821PyObject*
1822PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1823{
1824 switch(kind) {
1825 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001826 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001827 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001828 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001829 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001830 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001831 default:
1832 assert(0 && "invalid kind");
1833 PyErr_SetString(PyExc_SystemError, "invalid kind");
1834 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001835 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001836}
1837
Victor Stinner25a4b292011-10-06 12:31:55 +02001838/* Ensure that a string uses the most efficient storage, if it is not the
1839 case: create a new string with of the right kind. Write NULL into *p_unicode
1840 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001841static void
Victor Stinner25a4b292011-10-06 12:31:55 +02001842unicode_adjust_maxchar(PyObject **p_unicode)
1843{
1844 PyObject *unicode, *copy;
1845 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001846 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02001847 unsigned int kind;
1848
1849 assert(p_unicode != NULL);
1850 unicode = *p_unicode;
1851 assert(PyUnicode_IS_READY(unicode));
1852 if (PyUnicode_IS_ASCII(unicode))
1853 return;
1854
1855 len = PyUnicode_GET_LENGTH(unicode);
1856 kind = PyUnicode_KIND(unicode);
1857 if (kind == PyUnicode_1BYTE_KIND) {
1858 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001859 max_char = ucs1lib_find_max_char(u, u + len);
1860 if (max_char >= 128)
1861 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001862 }
1863 else if (kind == PyUnicode_2BYTE_KIND) {
1864 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001865 max_char = ucs2lib_find_max_char(u, u + len);
1866 if (max_char >= 256)
1867 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001868 }
1869 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001870 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02001871 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001872 max_char = ucs4lib_find_max_char(u, u + len);
1873 if (max_char >= 0x10000)
1874 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001875 }
Victor Stinner25a4b292011-10-06 12:31:55 +02001876 copy = PyUnicode_New(len, max_char);
1877 copy_characters(copy, 0, unicode, 0, len);
1878 Py_DECREF(unicode);
1879 *p_unicode = copy;
1880}
1881
Victor Stinner034f6cf2011-09-30 02:26:44 +02001882PyObject*
1883PyUnicode_Copy(PyObject *unicode)
1884{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001885 Py_ssize_t size;
1886 PyObject *copy;
1887 void *data;
1888
Victor Stinner034f6cf2011-09-30 02:26:44 +02001889 if (!PyUnicode_Check(unicode)) {
1890 PyErr_BadInternalCall();
1891 return NULL;
1892 }
1893 if (PyUnicode_READY(unicode))
1894 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001895
1896 size = PyUnicode_GET_LENGTH(unicode);
1897 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1898 if (!copy)
1899 return NULL;
1900 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1901
1902 data = PyUnicode_DATA(unicode);
1903 switch (PyUnicode_KIND(unicode))
1904 {
1905 case PyUnicode_1BYTE_KIND:
1906 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1907 break;
1908 case PyUnicode_2BYTE_KIND:
1909 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1910 break;
1911 case PyUnicode_4BYTE_KIND:
1912 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1913 break;
1914 default:
1915 assert(0);
1916 break;
1917 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001918 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001919 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001920}
1921
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001922
Victor Stinnerbc603d12011-10-02 01:00:40 +02001923/* Widen Unicode objects to larger buffers. Don't write terminating null
1924 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001925
1926void*
1927_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1928{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001929 Py_ssize_t len;
1930 void *result;
1931 unsigned int skind;
1932
1933 if (PyUnicode_READY(s))
1934 return NULL;
1935
1936 len = PyUnicode_GET_LENGTH(s);
1937 skind = PyUnicode_KIND(s);
1938 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02001939 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001940 return NULL;
1941 }
1942 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001943 case PyUnicode_2BYTE_KIND:
1944 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1945 if (!result)
1946 return PyErr_NoMemory();
1947 assert(skind == PyUnicode_1BYTE_KIND);
1948 _PyUnicode_CONVERT_BYTES(
1949 Py_UCS1, Py_UCS2,
1950 PyUnicode_1BYTE_DATA(s),
1951 PyUnicode_1BYTE_DATA(s) + len,
1952 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001953 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001954 case PyUnicode_4BYTE_KIND:
1955 result = PyMem_Malloc(len * sizeof(Py_UCS4));
1956 if (!result)
1957 return PyErr_NoMemory();
1958 if (skind == PyUnicode_2BYTE_KIND) {
1959 _PyUnicode_CONVERT_BYTES(
1960 Py_UCS2, Py_UCS4,
1961 PyUnicode_2BYTE_DATA(s),
1962 PyUnicode_2BYTE_DATA(s) + len,
1963 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001964 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001965 else {
1966 assert(skind == PyUnicode_1BYTE_KIND);
1967 _PyUnicode_CONVERT_BYTES(
1968 Py_UCS1, Py_UCS4,
1969 PyUnicode_1BYTE_DATA(s),
1970 PyUnicode_1BYTE_DATA(s) + len,
1971 result);
1972 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001973 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001974 default:
1975 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001976 }
Victor Stinner01698042011-10-04 00:04:26 +02001977 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001978 return NULL;
1979}
1980
1981static Py_UCS4*
1982as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1983 int copy_null)
1984{
1985 int kind;
1986 void *data;
1987 Py_ssize_t len, targetlen;
1988 if (PyUnicode_READY(string) == -1)
1989 return NULL;
1990 kind = PyUnicode_KIND(string);
1991 data = PyUnicode_DATA(string);
1992 len = PyUnicode_GET_LENGTH(string);
1993 targetlen = len;
1994 if (copy_null)
1995 targetlen++;
1996 if (!target) {
1997 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1998 PyErr_NoMemory();
1999 return NULL;
2000 }
2001 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2002 if (!target) {
2003 PyErr_NoMemory();
2004 return NULL;
2005 }
2006 }
2007 else {
2008 if (targetsize < targetlen) {
2009 PyErr_Format(PyExc_SystemError,
2010 "string is longer than the buffer");
2011 if (copy_null && 0 < targetsize)
2012 target[0] = 0;
2013 return NULL;
2014 }
2015 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002016 if (kind == PyUnicode_1BYTE_KIND) {
2017 Py_UCS1 *start = (Py_UCS1 *) data;
2018 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002019 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002020 else if (kind == PyUnicode_2BYTE_KIND) {
2021 Py_UCS2 *start = (Py_UCS2 *) data;
2022 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2023 }
2024 else {
2025 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002026 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002027 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002028 if (copy_null)
2029 target[len] = 0;
2030 return target;
2031}
2032
2033Py_UCS4*
2034PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2035 int copy_null)
2036{
2037 if (target == NULL || targetsize < 1) {
2038 PyErr_BadInternalCall();
2039 return NULL;
2040 }
2041 return as_ucs4(string, target, targetsize, copy_null);
2042}
2043
2044Py_UCS4*
2045PyUnicode_AsUCS4Copy(PyObject *string)
2046{
2047 return as_ucs4(string, NULL, 0, 1);
2048}
2049
#ifdef HAVE_WCHAR_H

/* Create a Unicode object from the wide-character buffer `w`.

   A size of -1 means that the length is taken with wcslen().  With
   w == NULL only size 0 is accepted (the result of PyUnicode_New(0, 0) is
   returned); any other size fails with PyErr_BadInternalCall(). */
PyObject *
PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
{
    if (w != NULL) {
        if (size == -1)
            size = wcslen(w);
        return PyUnicode_FromUnicode(w, size);
    }

    /* no buffer supplied */
    if (size != 0) {
        PyErr_BadInternalCall();
        return NULL;
    }
    return PyUnicode_New(0, 0);
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002070
Walter Dörwald346737f2007-05-31 10:44:43 +00002071static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002072makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2073 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002074{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002075 *fmt++ = '%';
2076 if (width) {
2077 if (zeropad)
2078 *fmt++ = '0';
2079 fmt += sprintf(fmt, "%d", width);
2080 }
2081 if (precision)
2082 fmt += sprintf(fmt, ".%d", precision);
2083 if (longflag)
2084 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002085 else if (longlongflag) {
2086 /* longlongflag should only ever be nonzero on machines with
2087 HAVE_LONG_LONG defined */
2088#ifdef HAVE_LONG_LONG
2089 char *f = PY_FORMAT_LONG_LONG;
2090 while (*f)
2091 *fmt++ = *f++;
2092#else
2093 /* we shouldn't ever get here */
2094 assert(0);
2095 *fmt++ = 'l';
2096#endif
2097 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002098 else if (size_tflag) {
2099 char *f = PY_FORMAT_SIZE_T;
2100 while (*f)
2101 *fmt++ = *f++;
2102 }
2103 *fmt++ = c;
2104 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002105}
2106
Victor Stinner96865452011-03-01 23:44:09 +00002107/* helper for PyUnicode_FromFormatV() */
2108
2109static const char*
2110parse_format_flags(const char *f,
2111 int *p_width, int *p_precision,
2112 int *p_longflag, int *p_longlongflag, int *p_size_tflag)
2113{
2114 int width, precision, longflag, longlongflag, size_tflag;
2115
2116 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2117 f++;
2118 width = 0;
2119 while (Py_ISDIGIT((unsigned)*f))
2120 width = (width*10) + *f++ - '0';
2121 precision = 0;
2122 if (*f == '.') {
2123 f++;
2124 while (Py_ISDIGIT((unsigned)*f))
2125 precision = (precision*10) + *f++ - '0';
2126 if (*f == '%') {
2127 /* "%.3%s" => f points to "3" */
2128 f--;
2129 }
2130 }
2131 if (*f == '\0') {
2132 /* bogus format "%.1" => go backward, f points to "1" */
2133 f--;
2134 }
2135 if (p_width != NULL)
2136 *p_width = width;
2137 if (p_precision != NULL)
2138 *p_precision = precision;
2139
2140 /* Handle %ld, %lu, %lld and %llu. */
2141 longflag = 0;
2142 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002143 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002144
2145 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002146 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002147 longflag = 1;
2148 ++f;
2149 }
2150#ifdef HAVE_LONG_LONG
2151 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002152 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002153 longlongflag = 1;
2154 f += 2;
2155 }
2156#endif
2157 }
2158 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002159 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002160 size_tflag = 1;
2161 ++f;
2162 }
2163 if (p_longflag != NULL)
2164 *p_longflag = longflag;
2165 if (p_longlongflag != NULL)
2166 *p_longlongflag = longlongflag;
2167 if (p_size_tflag != NULL)
2168 *p_size_tflag = size_tflag;
2169 return f;
2170}
2171
/* Maximum number of characters required for the output of %ld.
   21 characters allow for a 64-bit integer (in decimal) and an optional
   sign. */
#define MAX_LONG_CHARS 21
/* Maximum number of characters required for the output of %lld.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits, plus 1 for the
   sign.  53/22 is an upper bound for log10(256), and
   1 + (SIZEOF_LONG_LONG*53-1) / 22 is the integer ceiling of
   SIZEOF_LONG_LONG*53/22; the remaining 1 accounts for the sign. */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2179
Walter Dörwaldd2034312007-05-18 16:29:38 +00002180PyObject *
2181PyUnicode_FromFormatV(const char *format, va_list vargs)
2182{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002183 va_list count;
2184 Py_ssize_t callcount = 0;
2185 PyObject **callresults = NULL;
2186 PyObject **callresult = NULL;
2187 Py_ssize_t n = 0;
2188 int width = 0;
2189 int precision = 0;
2190 int zeropad;
2191 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002192 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002193 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002194 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002195 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2196 Py_UCS4 argmaxchar;
2197 Py_ssize_t numbersize = 0;
2198 char *numberresults = NULL;
2199 char *numberresult = NULL;
2200 Py_ssize_t i;
2201 int kind;
2202 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002203
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002204 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002205 /* step 1: count the number of %S/%R/%A/%s format specifications
2206 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2207 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002208 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002209 * also estimate a upper bound for all the number formats in the string,
2210 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002211 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002212 for (f = format; *f; f++) {
2213 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002214 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002215 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2216 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2217 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2218 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002219
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002220 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002221#ifdef HAVE_LONG_LONG
2222 if (longlongflag) {
2223 if (width < MAX_LONG_LONG_CHARS)
2224 width = MAX_LONG_LONG_CHARS;
2225 }
2226 else
2227#endif
2228 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2229 including sign. Decimal takes the most space. This
2230 isn't enough for octal. If a width is specified we
2231 need more (which we allocate later). */
2232 if (width < MAX_LONG_CHARS)
2233 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002234
2235 /* account for the size + '\0' to separate numbers
2236 inside of the numberresults buffer */
2237 numbersize += (width + 1);
2238 }
2239 }
2240 else if ((unsigned char)*f > 127) {
2241 PyErr_Format(PyExc_ValueError,
2242 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2243 "string, got a non-ASCII byte: 0x%02x",
2244 (unsigned char)*f);
2245 return NULL;
2246 }
2247 }
2248 /* step 2: allocate memory for the results of
2249 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2250 if (callcount) {
2251 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2252 if (!callresults) {
2253 PyErr_NoMemory();
2254 return NULL;
2255 }
2256 callresult = callresults;
2257 }
2258 /* step 2.5: allocate memory for the results of formating numbers */
2259 if (numbersize) {
2260 numberresults = PyObject_Malloc(numbersize);
2261 if (!numberresults) {
2262 PyErr_NoMemory();
2263 goto fail;
2264 }
2265 numberresult = numberresults;
2266 }
2267
2268 /* step 3: format numbers and figure out how large a buffer we need */
2269 for (f = format; *f; f++) {
2270 if (*f == '%') {
2271 const char* p;
2272 int longflag;
2273 int longlongflag;
2274 int size_tflag;
2275 int numprinted;
2276
2277 p = f;
2278 zeropad = (f[1] == '0');
2279 f = parse_format_flags(f, &width, &precision,
2280 &longflag, &longlongflag, &size_tflag);
2281 switch (*f) {
2282 case 'c':
2283 {
2284 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002285 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002286 n++;
2287 break;
2288 }
2289 case '%':
2290 n++;
2291 break;
2292 case 'i':
2293 case 'd':
2294 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2295 width, precision, *f);
2296 if (longflag)
2297 numprinted = sprintf(numberresult, fmt,
2298 va_arg(count, long));
2299#ifdef HAVE_LONG_LONG
2300 else if (longlongflag)
2301 numprinted = sprintf(numberresult, fmt,
2302 va_arg(count, PY_LONG_LONG));
2303#endif
2304 else if (size_tflag)
2305 numprinted = sprintf(numberresult, fmt,
2306 va_arg(count, Py_ssize_t));
2307 else
2308 numprinted = sprintf(numberresult, fmt,
2309 va_arg(count, int));
2310 n += numprinted;
2311 /* advance by +1 to skip over the '\0' */
2312 numberresult += (numprinted + 1);
2313 assert(*(numberresult - 1) == '\0');
2314 assert(*(numberresult - 2) != '\0');
2315 assert(numprinted >= 0);
2316 assert(numberresult <= numberresults + numbersize);
2317 break;
2318 case 'u':
2319 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2320 width, precision, 'u');
2321 if (longflag)
2322 numprinted = sprintf(numberresult, fmt,
2323 va_arg(count, unsigned long));
2324#ifdef HAVE_LONG_LONG
2325 else if (longlongflag)
2326 numprinted = sprintf(numberresult, fmt,
2327 va_arg(count, unsigned PY_LONG_LONG));
2328#endif
2329 else if (size_tflag)
2330 numprinted = sprintf(numberresult, fmt,
2331 va_arg(count, size_t));
2332 else
2333 numprinted = sprintf(numberresult, fmt,
2334 va_arg(count, unsigned int));
2335 n += numprinted;
2336 numberresult += (numprinted + 1);
2337 assert(*(numberresult - 1) == '\0');
2338 assert(*(numberresult - 2) != '\0');
2339 assert(numprinted >= 0);
2340 assert(numberresult <= numberresults + numbersize);
2341 break;
2342 case 'x':
2343 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2344 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2345 n += numprinted;
2346 numberresult += (numprinted + 1);
2347 assert(*(numberresult - 1) == '\0');
2348 assert(*(numberresult - 2) != '\0');
2349 assert(numprinted >= 0);
2350 assert(numberresult <= numberresults + numbersize);
2351 break;
2352 case 'p':
2353 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2354 /* %p is ill-defined: ensure leading 0x. */
2355 if (numberresult[1] == 'X')
2356 numberresult[1] = 'x';
2357 else if (numberresult[1] != 'x') {
2358 memmove(numberresult + 2, numberresult,
2359 strlen(numberresult) + 1);
2360 numberresult[0] = '0';
2361 numberresult[1] = 'x';
2362 numprinted += 2;
2363 }
2364 n += numprinted;
2365 numberresult += (numprinted + 1);
2366 assert(*(numberresult - 1) == '\0');
2367 assert(*(numberresult - 2) != '\0');
2368 assert(numprinted >= 0);
2369 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002370 break;
2371 case 's':
2372 {
2373 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002374 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002375 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2376 if (!str)
2377 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002378 /* since PyUnicode_DecodeUTF8 returns already flexible
2379 unicode objects, there is no need to call ready on them */
2380 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002381 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002382 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002383 /* Remember the str and switch to the next slot */
2384 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002385 break;
2386 }
2387 case 'U':
2388 {
2389 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002390 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002391 if (PyUnicode_READY(obj) == -1)
2392 goto fail;
2393 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002394 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002395 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002396 break;
2397 }
2398 case 'V':
2399 {
2400 PyObject *obj = va_arg(count, PyObject *);
2401 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002402 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002403 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002404 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002405 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002406 if (PyUnicode_READY(obj) == -1)
2407 goto fail;
2408 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002409 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002410 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002411 *callresult++ = NULL;
2412 }
2413 else {
2414 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2415 if (!str_obj)
2416 goto fail;
Victor Stinnere1335c72011-10-04 20:53:03 +02002417 if (PyUnicode_READY(str_obj)) {
2418 Py_DECREF(str_obj);
2419 goto fail;
2420 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002421 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002422 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002423 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002424 *callresult++ = str_obj;
2425 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002426 break;
2427 }
2428 case 'S':
2429 {
2430 PyObject *obj = va_arg(count, PyObject *);
2431 PyObject *str;
2432 assert(obj);
2433 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002434 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002435 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002436 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002437 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002438 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002439 /* Remember the str and switch to the next slot */
2440 *callresult++ = str;
2441 break;
2442 }
2443 case 'R':
2444 {
2445 PyObject *obj = va_arg(count, PyObject *);
2446 PyObject *repr;
2447 assert(obj);
2448 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002449 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002450 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002451 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002452 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002453 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002454 /* Remember the repr and switch to the next slot */
2455 *callresult++ = repr;
2456 break;
2457 }
2458 case 'A':
2459 {
2460 PyObject *obj = va_arg(count, PyObject *);
2461 PyObject *ascii;
2462 assert(obj);
2463 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002464 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002465 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002466 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002467 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002468 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002469 /* Remember the repr and switch to the next slot */
2470 *callresult++ = ascii;
2471 break;
2472 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002473 default:
2474 /* if we stumble upon an unknown
2475 formatting code, copy the rest of
2476 the format string to the output
2477 string. (we cannot just skip the
2478 code, since there's no way to know
2479 what's in the argument list) */
2480 n += strlen(p);
2481 goto expand;
2482 }
2483 } else
2484 n++;
2485 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002486 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002487 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002488 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002489 we don't have to resize the string.
2490 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002491 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002492 if (!string)
2493 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002494 kind = PyUnicode_KIND(string);
2495 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002496 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002497 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002498
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002499 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002500 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002501 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002502
2503 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002504 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2505 /* checking for == because the last argument could be a empty
2506 string, which causes i to point to end, the assert at the end of
2507 the loop */
2508 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002509
Benjamin Peterson14339b62009-01-31 16:36:08 +00002510 switch (*f) {
2511 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002512 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002513 const int ordinal = va_arg(vargs, int);
2514 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002515 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002516 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002517 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002518 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002519 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002520 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002521 case 'p':
2522 /* unused, since we already have the result */
2523 if (*f == 'p')
2524 (void) va_arg(vargs, void *);
2525 else
2526 (void) va_arg(vargs, int);
2527 /* extract the result from numberresults and append. */
2528 for (; *numberresult; ++i, ++numberresult)
2529 PyUnicode_WRITE(kind, data, i, *numberresult);
2530 /* skip over the separating '\0' */
2531 assert(*numberresult == '\0');
2532 numberresult++;
2533 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002534 break;
2535 case 's':
2536 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002537 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002538 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002539 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002540 size = PyUnicode_GET_LENGTH(*callresult);
2541 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002542 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002543 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002544 /* We're done with the unicode()/repr() => forget it */
2545 Py_DECREF(*callresult);
2546 /* switch to next unicode()/repr() result */
2547 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002548 break;
2549 }
2550 case 'U':
2551 {
2552 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002553 Py_ssize_t size;
2554 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2555 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002556 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002557 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002558 break;
2559 }
2560 case 'V':
2561 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002562 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002563 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002564 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002565 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002566 size = PyUnicode_GET_LENGTH(obj);
2567 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002568 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002569 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002570 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002571 size = PyUnicode_GET_LENGTH(*callresult);
2572 assert(PyUnicode_KIND(*callresult) <=
2573 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002574 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002575 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002576 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002577 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002578 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002579 break;
2580 }
2581 case 'S':
2582 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002583 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002584 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002585 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002586 /* unused, since we already have the result */
2587 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002588 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002589 copy_characters(string, i, *callresult, 0, size);
2590 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002591 /* We're done with the unicode()/repr() => forget it */
2592 Py_DECREF(*callresult);
2593 /* switch to next unicode()/repr() result */
2594 ++callresult;
2595 break;
2596 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002597 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002598 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002599 break;
2600 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002601 for (; *p; ++p, ++i)
2602 PyUnicode_WRITE(kind, data, i, *p);
2603 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002604 goto end;
2605 }
Victor Stinner1205f272010-09-11 00:54:47 +00002606 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002607 else {
2608 assert(i < PyUnicode_GET_LENGTH(string));
2609 PyUnicode_WRITE(kind, data, i++, *f);
2610 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002611 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002612 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002613
Benjamin Peterson29060642009-01-31 22:14:21 +00002614 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002615 if (callresults)
2616 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002617 if (numberresults)
2618 PyObject_Free(numberresults);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002619 assert(_PyUnicode_CheckConsistency(string, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002620 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00002621 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002622 if (callresults) {
2623 PyObject **callresult2 = callresults;
2624 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002625 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002626 ++callresult2;
2627 }
2628 PyObject_Free(callresults);
2629 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002630 if (numberresults)
2631 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002632 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002633}
2634
Walter Dörwaldd2034312007-05-18 16:29:38 +00002635PyObject *
2636PyUnicode_FromFormat(const char *format, ...)
2637{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002638 PyObject* ret;
2639 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002640
2641#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002642 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002643#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002644 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002645#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002646 ret = PyUnicode_FromFormatV(format, vargs);
2647 va_end(vargs);
2648 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002649}
2650
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002651#ifdef HAVE_WCHAR_H
2652
Victor Stinner5593d8a2010-10-02 11:11:27 +00002653/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2654 convert a Unicode object to a wide character string.
2655
Victor Stinnerd88d9832011-09-06 02:00:05 +02002656 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002657 character) required to convert the unicode object. Ignore size argument.
2658
Victor Stinnerd88d9832011-09-06 02:00:05 +02002659 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002660 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002661 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002662static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00002663unicode_aswidechar(PyUnicodeObject *unicode,
2664 wchar_t *w,
2665 Py_ssize_t size)
2666{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002667 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002668 const wchar_t *wstr;
2669
2670 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
2671 if (wstr == NULL)
2672 return -1;
2673
Victor Stinner5593d8a2010-10-02 11:11:27 +00002674 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002675 if (size > res)
2676 size = res + 1;
2677 else
2678 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002679 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002680 return res;
2681 }
2682 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002683 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002684}
2685
2686Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002687PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002688 wchar_t *w,
2689 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002690{
2691 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002692 PyErr_BadInternalCall();
2693 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002694 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002695 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002696}
2697
Victor Stinner137c34c2010-09-29 10:25:54 +00002698wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002699PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002700 Py_ssize_t *size)
2701{
2702 wchar_t* buffer;
2703 Py_ssize_t buflen;
2704
2705 if (unicode == NULL) {
2706 PyErr_BadInternalCall();
2707 return NULL;
2708 }
2709
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002710 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002711 if (buflen == -1)
2712 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002713 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002714 PyErr_NoMemory();
2715 return NULL;
2716 }
2717
Victor Stinner137c34c2010-09-29 10:25:54 +00002718 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2719 if (buffer == NULL) {
2720 PyErr_NoMemory();
2721 return NULL;
2722 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002723 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002724 if (buflen == -1)
2725 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002726 if (size != NULL)
2727 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002728 return buffer;
2729}
2730
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002731#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002732
Alexander Belopolsky40018472011-02-26 01:02:56 +00002733PyObject *
2734PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002735{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002736 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002737 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002738 PyErr_SetString(PyExc_ValueError,
2739 "chr() arg not in range(0x110000)");
2740 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002741 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002742
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002743 if (ordinal < 256)
2744 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002745
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002746 v = PyUnicode_New(1, ordinal);
2747 if (v == NULL)
2748 return NULL;
2749 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002750 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002751 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002752}
2753
Alexander Belopolsky40018472011-02-26 01:02:56 +00002754PyObject *
2755PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002756{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002757 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002758 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002759 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002760 if (PyUnicode_READY(obj))
2761 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002762 Py_INCREF(obj);
2763 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002764 }
2765 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002766 /* For a Unicode subtype that's not a Unicode object,
2767 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002768 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002769 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002770 PyErr_Format(PyExc_TypeError,
2771 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002772 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002773 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002774}
2775
Alexander Belopolsky40018472011-02-26 01:02:56 +00002776PyObject *
2777PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002778 const char *encoding,
2779 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002780{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002781 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002782 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002783
Guido van Rossumd57fd912000-03-10 22:53:23 +00002784 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002785 PyErr_BadInternalCall();
2786 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002787 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002788
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002789 /* Decoding bytes objects is the most common case and should be fast */
2790 if (PyBytes_Check(obj)) {
2791 if (PyBytes_GET_SIZE(obj) == 0) {
2792 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002793 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002794 }
2795 else {
2796 v = PyUnicode_Decode(
2797 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2798 encoding, errors);
2799 }
2800 return v;
2801 }
2802
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002803 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002804 PyErr_SetString(PyExc_TypeError,
2805 "decoding str is not supported");
2806 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002807 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002808
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002809 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2810 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2811 PyErr_Format(PyExc_TypeError,
2812 "coercing to str: need bytes, bytearray "
2813 "or buffer-like object, %.80s found",
2814 Py_TYPE(obj)->tp_name);
2815 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002816 }
Tim Petersced69f82003-09-16 20:30:58 +00002817
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002818 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002819 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002820 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002821 }
Tim Petersced69f82003-09-16 20:30:58 +00002822 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002823 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002824
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002825 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002826 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002827}
2828
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8, writing the result into the buffer `lower` of
   `lower_len` bytes.  A NULL encoding is normalized to "utf-8".

   Return 0 on error (the normalized name, including its terminating '\0',
   does not fit into lower_len bytes), 1 on success.  On error the content
   of `lower` is unspecified. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    /* without this check, lower_len - 1 below would wrap around */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* sizeof counts the terminating '\0' that strcpy() writes too */
        if (lower_len < sizeof("utf-8"))
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }
    e = encoding;
    l = lower;
    /* last byte of the buffer, reserved for the terminating '\0' */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2865
Alexander Belopolsky40018472011-02-26 01:02:56 +00002866PyObject *
2867PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002868 Py_ssize_t size,
2869 const char *encoding,
2870 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002871{
2872 PyObject *buffer = NULL, *unicode;
2873 Py_buffer info;
2874 char lower[11]; /* Enough for any encoding shortcut */
2875
Fred Drakee4315f52000-05-09 19:53:39 +00002876 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002877 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002878 if ((strcmp(lower, "utf-8") == 0) ||
2879 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002880 return PyUnicode_DecodeUTF8(s, size, errors);
2881 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002882 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002883 (strcmp(lower, "iso-8859-1") == 0))
2884 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002885#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002886 else if (strcmp(lower, "mbcs") == 0)
2887 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002888#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002889 else if (strcmp(lower, "ascii") == 0)
2890 return PyUnicode_DecodeASCII(s, size, errors);
2891 else if (strcmp(lower, "utf-16") == 0)
2892 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2893 else if (strcmp(lower, "utf-32") == 0)
2894 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2895 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002896
2897 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002898 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002899 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002900 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002901 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002902 if (buffer == NULL)
2903 goto onError;
2904 unicode = PyCodec_Decode(buffer, encoding, errors);
2905 if (unicode == NULL)
2906 goto onError;
2907 if (!PyUnicode_Check(unicode)) {
2908 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002909 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002910 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002911 Py_DECREF(unicode);
2912 goto onError;
2913 }
2914 Py_DECREF(buffer);
Victor Stinner17efeed2011-10-04 20:05:46 +02002915#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02002916 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002917 Py_DECREF(unicode);
2918 return NULL;
2919 }
Victor Stinner17efeed2011-10-04 20:05:46 +02002920#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002921 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002922 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002923
Benjamin Peterson29060642009-01-31 22:14:21 +00002924 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002925 Py_XDECREF(buffer);
2926 return NULL;
2927}
2928
Alexander Belopolsky40018472011-02-26 01:02:56 +00002929PyObject *
2930PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002931 const char *encoding,
2932 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002933{
2934 PyObject *v;
2935
2936 if (!PyUnicode_Check(unicode)) {
2937 PyErr_BadArgument();
2938 goto onError;
2939 }
2940
2941 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002942 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002943
2944 /* Decode via the codec registry */
2945 v = PyCodec_Decode(unicode, encoding, errors);
2946 if (v == NULL)
2947 goto onError;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002948 assert(_PyUnicode_CheckConsistency(v, 1));
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002949 return v;
2950
Benjamin Peterson29060642009-01-31 22:14:21 +00002951 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002952 return NULL;
2953}
2954
Alexander Belopolsky40018472011-02-26 01:02:56 +00002955PyObject *
2956PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002957 const char *encoding,
2958 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002959{
2960 PyObject *v;
2961
2962 if (!PyUnicode_Check(unicode)) {
2963 PyErr_BadArgument();
2964 goto onError;
2965 }
2966
2967 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002968 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002969
2970 /* Decode via the codec registry */
2971 v = PyCodec_Decode(unicode, encoding, errors);
2972 if (v == NULL)
2973 goto onError;
2974 if (!PyUnicode_Check(v)) {
2975 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002976 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002977 Py_TYPE(v)->tp_name);
2978 Py_DECREF(v);
2979 goto onError;
2980 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002981 assert(_PyUnicode_CheckConsistency(v, 1));
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002982 return v;
2983
Benjamin Peterson29060642009-01-31 22:14:21 +00002984 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002985 return NULL;
2986}
2987
Alexander Belopolsky40018472011-02-26 01:02:56 +00002988PyObject *
2989PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002990 Py_ssize_t size,
2991 const char *encoding,
2992 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002993{
2994 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002995
Guido van Rossumd57fd912000-03-10 22:53:23 +00002996 unicode = PyUnicode_FromUnicode(s, size);
2997 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002998 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002999 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3000 Py_DECREF(unicode);
3001 return v;
3002}
3003
Alexander Belopolsky40018472011-02-26 01:02:56 +00003004PyObject *
3005PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003006 const char *encoding,
3007 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003008{
3009 PyObject *v;
3010
3011 if (!PyUnicode_Check(unicode)) {
3012 PyErr_BadArgument();
3013 goto onError;
3014 }
3015
3016 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003017 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003018
3019 /* Encode via the codec registry */
3020 v = PyCodec_Encode(unicode, encoding, errors);
3021 if (v == NULL)
3022 goto onError;
3023 return v;
3024
Benjamin Peterson29060642009-01-31 22:14:21 +00003025 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003026 return NULL;
3027}
3028
Victor Stinnerad158722010-10-27 00:25:46 +00003029PyObject *
3030PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003031{
Victor Stinner99b95382011-07-04 14:23:54 +02003032#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003033 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3034 PyUnicode_GET_SIZE(unicode),
3035 NULL);
3036#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003037 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003038#else
Victor Stinner793b5312011-04-27 00:24:21 +02003039 PyInterpreterState *interp = PyThreadState_GET()->interp;
3040 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3041 cannot use it to encode and decode filenames before it is loaded. Load
3042 the Python codec requires to encode at least its own filename. Use the C
3043 version of the locale codec until the codec registry is initialized and
3044 the Python codec is loaded.
3045
3046 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3047 cannot only rely on it: check also interp->fscodec_initialized for
3048 subinterpreters. */
3049 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003050 return PyUnicode_AsEncodedString(unicode,
3051 Py_FileSystemDefaultEncoding,
3052 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003053 }
3054 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003055 /* locale encoding with surrogateescape */
3056 wchar_t *wchar;
3057 char *bytes;
3058 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00003059 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003060
3061 wchar = PyUnicode_AsWideCharString(unicode, NULL);
3062 if (wchar == NULL)
3063 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003064 bytes = _Py_wchar2char(wchar, &error_pos);
3065 if (bytes == NULL) {
3066 if (error_pos != (size_t)-1) {
3067 char *errmsg = strerror(errno);
3068 PyObject *exc = NULL;
3069 if (errmsg == NULL)
3070 errmsg = "Py_wchar2char() failed";
3071 raise_encode_exception(&exc,
3072 "filesystemencoding",
3073 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
3074 error_pos, error_pos+1,
3075 errmsg);
3076 Py_XDECREF(exc);
3077 }
3078 else
3079 PyErr_NoMemory();
3080 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003081 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003082 }
3083 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003084
3085 bytes_obj = PyBytes_FromString(bytes);
3086 PyMem_Free(bytes);
3087 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00003088 }
Victor Stinnerad158722010-10-27 00:25:46 +00003089#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003090}
3091
Alexander Belopolsky40018472011-02-26 01:02:56 +00003092PyObject *
3093PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003094 const char *encoding,
3095 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003096{
3097 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003098 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003099
Guido van Rossumd57fd912000-03-10 22:53:23 +00003100 if (!PyUnicode_Check(unicode)) {
3101 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003102 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003103 }
Fred Drakee4315f52000-05-09 19:53:39 +00003104
Fred Drakee4315f52000-05-09 19:53:39 +00003105 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003106 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003107 if ((strcmp(lower, "utf-8") == 0) ||
3108 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003109 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003110 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003111 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003112 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003113 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003114 }
Victor Stinner37296e82010-06-10 13:36:23 +00003115 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003116 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003117 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003118 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003119#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003120 else if (strcmp(lower, "mbcs") == 0)
3121 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3122 PyUnicode_GET_SIZE(unicode),
3123 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003124#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003125 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003126 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003127 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003128
3129 /* Encode via the codec registry */
3130 v = PyCodec_Encode(unicode, encoding, errors);
3131 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003132 return NULL;
3133
3134 /* The normal path */
3135 if (PyBytes_Check(v))
3136 return v;
3137
3138 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003139 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003140 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003141 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003142
3143 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3144 "encoder %s returned bytearray instead of bytes",
3145 encoding);
3146 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003147 Py_DECREF(v);
3148 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003149 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003150
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003151 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3152 Py_DECREF(v);
3153 return b;
3154 }
3155
3156 PyErr_Format(PyExc_TypeError,
3157 "encoder did not return a bytes object (type=%.400s)",
3158 Py_TYPE(v)->tp_name);
3159 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003160 return NULL;
3161}
3162
Alexander Belopolsky40018472011-02-26 01:02:56 +00003163PyObject *
3164PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003165 const char *encoding,
3166 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003167{
3168 PyObject *v;
3169
3170 if (!PyUnicode_Check(unicode)) {
3171 PyErr_BadArgument();
3172 goto onError;
3173 }
3174
3175 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003176 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003177
3178 /* Encode via the codec registry */
3179 v = PyCodec_Encode(unicode, encoding, errors);
3180 if (v == NULL)
3181 goto onError;
3182 if (!PyUnicode_Check(v)) {
3183 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003184 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003185 Py_TYPE(v)->tp_name);
3186 Py_DECREF(v);
3187 goto onError;
3188 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003189 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003190
Benjamin Peterson29060642009-01-31 22:14:21 +00003191 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003192 return NULL;
3193}
3194
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003195PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003196PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003197 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003198 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3199}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003200
Christian Heimes5894ba72007-11-04 11:43:14 +00003201PyObject*
3202PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3203{
Victor Stinner99b95382011-07-04 14:23:54 +02003204#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003205 return PyUnicode_DecodeMBCS(s, size, NULL);
3206#elif defined(__APPLE__)
3207 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
3208#else
Victor Stinner793b5312011-04-27 00:24:21 +02003209 PyInterpreterState *interp = PyThreadState_GET()->interp;
3210 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3211 cannot use it to encode and decode filenames before it is loaded. Load
3212 the Python codec requires to encode at least its own filename. Use the C
3213 version of the locale codec until the codec registry is initialized and
3214 the Python codec is loaded.
3215
3216 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3217 cannot only rely on it: check also interp->fscodec_initialized for
3218 subinterpreters. */
3219 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003220 return PyUnicode_Decode(s, size,
3221 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003222 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003223 }
3224 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003225 /* locale encoding with surrogateescape */
3226 wchar_t *wchar;
3227 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00003228 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003229
3230 if (s[size] != '\0' || size != strlen(s)) {
3231 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3232 return NULL;
3233 }
3234
Victor Stinner168e1172010-10-16 23:16:16 +00003235 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003236 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00003237 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003238
Victor Stinner168e1172010-10-16 23:16:16 +00003239 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003240 PyMem_Free(wchar);
3241 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003242 }
Victor Stinnerad158722010-10-27 00:25:46 +00003243#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003244}
3245
Martin v. Löwis011e8422009-05-05 04:43:17 +00003246
3247int
3248PyUnicode_FSConverter(PyObject* arg, void* addr)
3249{
3250 PyObject *output = NULL;
3251 Py_ssize_t size;
3252 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003253 if (arg == NULL) {
3254 Py_DECREF(*(PyObject**)addr);
3255 return 1;
3256 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003257 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003258 output = arg;
3259 Py_INCREF(output);
3260 }
3261 else {
3262 arg = PyUnicode_FromObject(arg);
3263 if (!arg)
3264 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003265 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003266 Py_DECREF(arg);
3267 if (!output)
3268 return 0;
3269 if (!PyBytes_Check(output)) {
3270 Py_DECREF(output);
3271 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3272 return 0;
3273 }
3274 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003275 size = PyBytes_GET_SIZE(output);
3276 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003277 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003278 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003279 Py_DECREF(output);
3280 return 0;
3281 }
3282 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003283 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003284}
3285
3286
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003287int
3288PyUnicode_FSDecoder(PyObject* arg, void* addr)
3289{
3290 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003291 if (arg == NULL) {
3292 Py_DECREF(*(PyObject**)addr);
3293 return 1;
3294 }
3295 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003296 if (PyUnicode_READY(arg))
3297 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003298 output = arg;
3299 Py_INCREF(output);
3300 }
3301 else {
3302 arg = PyBytes_FromObject(arg);
3303 if (!arg)
3304 return 0;
3305 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3306 PyBytes_GET_SIZE(arg));
3307 Py_DECREF(arg);
3308 if (!output)
3309 return 0;
3310 if (!PyUnicode_Check(output)) {
3311 Py_DECREF(output);
3312 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3313 return 0;
3314 }
3315 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003316 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003317 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003318 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3319 Py_DECREF(output);
3320 return 0;
3321 }
3322 *(PyObject**)addr = output;
3323 return Py_CLEANUP_SUPPORTED;
3324}
3325
3326
Martin v. Löwis5b222132007-06-10 09:51:05 +00003327char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003328PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003329{
Christian Heimesf3863112007-11-22 07:46:41 +00003330 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003331 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
3332
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003333 if (!PyUnicode_Check(unicode)) {
3334 PyErr_BadArgument();
3335 return NULL;
3336 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003337 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003338 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003339
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003340 if (PyUnicode_UTF8(unicode) == NULL) {
3341 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003342 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3343 if (bytes == NULL)
3344 return NULL;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003345 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3346 if (_PyUnicode_UTF8(u) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003347 Py_DECREF(bytes);
3348 return NULL;
3349 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003350 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
3351 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003352 Py_DECREF(bytes);
3353 }
3354
3355 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003356 *psize = PyUnicode_UTF8_LENGTH(unicode);
3357 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003358}
3359
3360char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003361PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003362{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003363 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3364}
3365
3366#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02003367static int unicode_as_unicode_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003368#endif
3369
3370
3371Py_UNICODE *
3372PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3373{
3374 PyUnicodeObject *u;
3375 const unsigned char *one_byte;
3376#if SIZEOF_WCHAR_T == 4
3377 const Py_UCS2 *two_bytes;
3378#else
3379 const Py_UCS4 *four_bytes;
3380 const Py_UCS4 *ucs4_end;
3381 Py_ssize_t num_surrogates;
3382#endif
3383 wchar_t *w;
3384 wchar_t *wchar_end;
3385
3386 if (!PyUnicode_Check(unicode)) {
3387 PyErr_BadArgument();
3388 return NULL;
3389 }
3390 u = (PyUnicodeObject*)unicode;
3391 if (_PyUnicode_WSTR(u) == NULL) {
3392 /* Non-ASCII compact unicode object */
3393 assert(_PyUnicode_KIND(u) != 0);
3394 assert(PyUnicode_IS_READY(u));
3395
3396#ifdef Py_DEBUG
3397 ++unicode_as_unicode_calls;
3398#endif
3399
3400 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
3401#if SIZEOF_WCHAR_T == 2
3402 four_bytes = PyUnicode_4BYTE_DATA(u);
3403 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
3404 num_surrogates = 0;
3405
3406 for (; four_bytes < ucs4_end; ++four_bytes) {
3407 if (*four_bytes > 0xFFFF)
3408 ++num_surrogates;
3409 }
3410
3411 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
3412 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
3413 if (!_PyUnicode_WSTR(u)) {
3414 PyErr_NoMemory();
3415 return NULL;
3416 }
3417 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
3418
3419 w = _PyUnicode_WSTR(u);
3420 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
3421 four_bytes = PyUnicode_4BYTE_DATA(u);
3422 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3423 if (*four_bytes > 0xFFFF) {
3424 /* encode surrogate pair in this case */
3425 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
3426 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
3427 }
3428 else
3429 *w = *four_bytes;
3430
3431 if (w > wchar_end) {
3432 assert(0 && "Miscalculated string end");
3433 }
3434 }
3435 *w = 0;
3436#else
3437 /* sizeof(wchar_t) == 4 */
3438 Py_FatalError("Impossible unicode object state, wstr and str "
3439 "should share memory already.");
3440 return NULL;
3441#endif
3442 }
3443 else {
3444 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3445 (_PyUnicode_LENGTH(u) + 1));
3446 if (!_PyUnicode_WSTR(u)) {
3447 PyErr_NoMemory();
3448 return NULL;
3449 }
3450 if (!PyUnicode_IS_COMPACT_ASCII(u))
3451 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
3452 w = _PyUnicode_WSTR(u);
3453 wchar_end = w + _PyUnicode_LENGTH(u);
3454
3455 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
3456 one_byte = PyUnicode_1BYTE_DATA(u);
3457 for (; w < wchar_end; ++one_byte, ++w)
3458 *w = *one_byte;
3459 /* null-terminate the wstr */
3460 *w = 0;
3461 }
3462 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
3463#if SIZEOF_WCHAR_T == 4
3464 two_bytes = PyUnicode_2BYTE_DATA(u);
3465 for (; w < wchar_end; ++two_bytes, ++w)
3466 *w = *two_bytes;
3467 /* null-terminate the wstr */
3468 *w = 0;
3469#else
3470 /* sizeof(wchar_t) == 2 */
3471 PyObject_FREE(_PyUnicode_WSTR(u));
3472 _PyUnicode_WSTR(u) = NULL;
3473 Py_FatalError("Impossible unicode object state, wstr "
3474 "and str should share memory already.");
3475 return NULL;
3476#endif
3477 }
3478 else {
3479 assert(0 && "This should never happen.");
3480 }
3481 }
3482 }
3483 if (size != NULL)
3484 *size = PyUnicode_WSTR_LENGTH(u);
3485 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003486}
3487
Alexander Belopolsky40018472011-02-26 01:02:56 +00003488Py_UNICODE *
3489PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003490{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003491 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003492}
3493
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003494
Alexander Belopolsky40018472011-02-26 01:02:56 +00003495Py_ssize_t
3496PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003497{
3498 if (!PyUnicode_Check(unicode)) {
3499 PyErr_BadArgument();
3500 goto onError;
3501 }
3502 return PyUnicode_GET_SIZE(unicode);
3503
Benjamin Peterson29060642009-01-31 22:14:21 +00003504 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003505 return -1;
3506}
3507
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003508Py_ssize_t
3509PyUnicode_GetLength(PyObject *unicode)
3510{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003511 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003512 PyErr_BadArgument();
3513 return -1;
3514 }
3515
3516 return PyUnicode_GET_LENGTH(unicode);
3517}
3518
3519Py_UCS4
3520PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3521{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003522 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3523 PyErr_BadArgument();
3524 return (Py_UCS4)-1;
3525 }
3526 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3527 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003528 return (Py_UCS4)-1;
3529 }
3530 return PyUnicode_READ_CHAR(unicode, index);
3531}
3532
3533int
3534PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3535{
3536 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003537 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003538 return -1;
3539 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003540 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3541 PyErr_SetString(PyExc_IndexError, "string index out of range");
3542 return -1;
3543 }
3544 if (_PyUnicode_Dirty(unicode))
3545 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003546 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3547 index, ch);
3548 return 0;
3549}
3550
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    const char *name = "utf-8";

    return name;
}
3556
Victor Stinner554f3f02010-06-16 23:33:54 +00003557/* create or adjust a UnicodeDecodeError */
3558static void
3559make_decode_exception(PyObject **exceptionObject,
3560 const char *encoding,
3561 const char *input, Py_ssize_t length,
3562 Py_ssize_t startpos, Py_ssize_t endpos,
3563 const char *reason)
3564{
3565 if (*exceptionObject == NULL) {
3566 *exceptionObject = PyUnicodeDecodeError_Create(
3567 encoding, input, length, startpos, endpos, reason);
3568 }
3569 else {
3570 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3571 goto onError;
3572 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3573 goto onError;
3574 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3575 goto onError;
3576 }
3577 return;
3578
3579onError:
3580 Py_DECREF(*exceptionObject);
3581 *exceptionObject = NULL;
3582}
3583
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003584/* error handling callback helper:
3585 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003586 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003587 and adjust various state variables.
3588 return 0 on success, -1 on error
3589*/
3590
Alexander Belopolsky40018472011-02-26 01:02:56 +00003591static int
3592unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003593 const char *encoding, const char *reason,
3594 const char **input, const char **inend, Py_ssize_t *startinpos,
3595 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3596 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003597{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003598 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003599
3600 PyObject *restuple = NULL;
3601 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003602 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003603 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003604 Py_ssize_t requiredsize;
3605 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003606 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003607 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003608 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003609 int res = -1;
3610
3611 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003612 *errorHandler = PyCodec_LookupError(errors);
3613 if (*errorHandler == NULL)
3614 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003615 }
3616
Victor Stinner554f3f02010-06-16 23:33:54 +00003617 make_decode_exception(exceptionObject,
3618 encoding,
3619 *input, *inend - *input,
3620 *startinpos, *endinpos,
3621 reason);
3622 if (*exceptionObject == NULL)
3623 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003624
3625 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3626 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003627 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003628 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003629 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003630 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003631 }
3632 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003633 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003634
3635 /* Copy back the bytes variables, which might have been modified by the
3636 callback */
3637 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3638 if (!inputobj)
3639 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003640 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003641 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003642 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003643 *input = PyBytes_AS_STRING(inputobj);
3644 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003645 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003646 /* we can DECREF safely, as the exception has another reference,
3647 so the object won't go away. */
3648 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003649
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003650 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003651 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003652 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003653 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3654 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003655 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003656
3657 /* need more space? (at least enough for what we
3658 have+the replacement+the rest of the string (starting
3659 at the new input position), so we won't have to check space
3660 when there are no errors in the rest of the string) */
3661 repptr = PyUnicode_AS_UNICODE(repunicode);
3662 repsize = PyUnicode_GET_SIZE(repunicode);
3663 requiredsize = *outpos + repsize + insize-newpos;
3664 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003665 if (requiredsize<2*outsize)
3666 requiredsize = 2*outsize;
Victor Stinnerfe226c02011-10-03 03:52:20 +02003667 if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003668 goto onError;
3669 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003670 }
3671 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003672 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003673 Py_UNICODE_COPY(*outptr, repptr, repsize);
3674 *outptr += repsize;
3675 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003676
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003677 /* we made it! */
3678 res = 0;
3679
Benjamin Peterson29060642009-01-31 22:14:21 +00003680 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003681 Py_XDECREF(restuple);
3682 return res;
3683}
3684
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64. */
3690
/* Is c a base-64 character?
 *
 * Evaluates to non-zero when c is one of A-Z, a-z, 0-9, '+' or '/'.
 * The argument is evaluated several times, so it must not have side
 * effects. */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')
3698
/* Given that c is a base-64 character, what is its base-64 value?
 *
 * Maps A-Z to 0-25, a-z to 26-51, 0-9 to 52-61, '+' to 62 and anything
 * else to 63, so the result is only meaningful when IS_BASE64(c) holds.
 * The argument is evaluated several times. */

#define FROM_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) == '+' ? 62 : 63)
3706
/* What is the base-64 character of the bottom 6 bits of n?
 *
 * Only the low 6 bits of n are used; higher bits are masked off. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
3711
/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string.  Values above 127 are never decoded directly. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
3719
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is read-only.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
3753
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        - character to classify; only values 1..127 can be direct
 * directO  - non-zero if "Set O" characters are written as themselves
 * directWS - non-zero if whitespace is written as itself
 *
 * "Set D" characters (category 0) are always direct.  c is evaluated
 * several times, so it must not have side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003765
Alexander Belopolsky40018472011-02-26 01:02:56 +00003766PyObject *
3767PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003768 Py_ssize_t size,
3769 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003770{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003771 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3772}
3773
Antoine Pitrou244651a2009-05-04 18:56:13 +00003774/* The decoder. The only state we preserve is our read position,
3775 * i.e. how many characters we have consumed. So if we end in the
3776 * middle of a shift sequence we have to back off the read position
3777 * and the output to the beginning of the sequence, otherwise we lose
3778 * all the shift state (seen bits, number of bits seen, high
3779 * surrogate). */
3780
Alexander Belopolsky40018472011-02-26 01:02:56 +00003781PyObject *
3782PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003783 Py_ssize_t size,
3784 const char *errors,
3785 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003786{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003787 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003788 Py_ssize_t startinpos;
3789 Py_ssize_t endinpos;
3790 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003791 const char *e;
3792 PyUnicodeObject *unicode;
3793 Py_UNICODE *p;
3794 const char *errmsg = "";
3795 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003796 Py_UNICODE *shiftOutStart;
3797 unsigned int base64bits = 0;
3798 unsigned long base64buffer = 0;
3799 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003800 PyObject *errorHandler = NULL;
3801 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003802
3803 unicode = _PyUnicode_New(size);
3804 if (!unicode)
3805 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003806 if (size == 0) {
3807 if (consumed)
3808 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003809 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003810 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003811
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003812 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003813 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003814 e = s + size;
3815
3816 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003817 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003818 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003819 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003820
Antoine Pitrou244651a2009-05-04 18:56:13 +00003821 if (inShift) { /* in a base-64 section */
3822 if (IS_BASE64(ch)) { /* consume a base-64 character */
3823 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3824 base64bits += 6;
3825 s++;
3826 if (base64bits >= 16) {
3827 /* we have enough bits for a UTF-16 value */
3828 Py_UNICODE outCh = (Py_UNICODE)
3829 (base64buffer >> (base64bits-16));
3830 base64bits -= 16;
3831 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3832 if (surrogate) {
3833 /* expecting a second surrogate */
3834 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3835#ifdef Py_UNICODE_WIDE
3836 *p++ = (((surrogate & 0x3FF)<<10)
3837 | (outCh & 0x3FF)) + 0x10000;
3838#else
3839 *p++ = surrogate;
3840 *p++ = outCh;
3841#endif
3842 surrogate = 0;
3843 }
3844 else {
3845 surrogate = 0;
3846 errmsg = "second surrogate missing";
3847 goto utf7Error;
3848 }
3849 }
3850 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3851 /* first surrogate */
3852 surrogate = outCh;
3853 }
3854 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3855 errmsg = "unexpected second surrogate";
3856 goto utf7Error;
3857 }
3858 else {
3859 *p++ = outCh;
3860 }
3861 }
3862 }
3863 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003864 inShift = 0;
3865 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003866 if (surrogate) {
3867 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003868 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003869 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003870 if (base64bits > 0) { /* left-over bits */
3871 if (base64bits >= 6) {
3872 /* We've seen at least one base-64 character */
3873 errmsg = "partial character in shift sequence";
3874 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003875 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003876 else {
3877 /* Some bits remain; they should be zero */
3878 if (base64buffer != 0) {
3879 errmsg = "non-zero padding bits in shift sequence";
3880 goto utf7Error;
3881 }
3882 }
3883 }
3884 if (ch != '-') {
3885 /* '-' is absorbed; other terminating
3886 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003887 *p++ = ch;
3888 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003889 }
3890 }
3891 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003892 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003893 s++; /* consume '+' */
3894 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003895 s++;
3896 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003897 }
3898 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003899 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003900 shiftOutStart = p;
3901 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003902 }
3903 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003904 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003905 *p++ = ch;
3906 s++;
3907 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003908 else {
3909 startinpos = s-starts;
3910 s++;
3911 errmsg = "unexpected special character";
3912 goto utf7Error;
3913 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003914 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003915utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003916 outpos = p-PyUnicode_AS_UNICODE(unicode);
3917 endinpos = s-starts;
3918 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003919 errors, &errorHandler,
3920 "utf7", errmsg,
3921 &starts, &e, &startinpos, &endinpos, &exc, &s,
3922 &unicode, &outpos, &p))
3923 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003924 }
3925
Antoine Pitrou244651a2009-05-04 18:56:13 +00003926 /* end of string */
3927
3928 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3929 /* if we're in an inconsistent state, that's an error */
3930 if (surrogate ||
3931 (base64bits >= 6) ||
3932 (base64bits > 0 && base64buffer != 0)) {
3933 outpos = p-PyUnicode_AS_UNICODE(unicode);
3934 endinpos = size;
3935 if (unicode_decode_call_errorhandler(
3936 errors, &errorHandler,
3937 "utf7", "unterminated shift sequence",
3938 &starts, &e, &startinpos, &endinpos, &exc, &s,
3939 &unicode, &outpos, &p))
3940 goto onError;
3941 if (s < e)
3942 goto restart;
3943 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003944 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003945
3946 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003947 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003948 if (inShift) {
3949 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003950 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003951 }
3952 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003953 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003954 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003955 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003956
Victor Stinnerfe226c02011-10-03 03:52:20 +02003957 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003958 goto onError;
3959
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003960 Py_XDECREF(errorHandler);
3961 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02003962#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02003963 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003964 Py_DECREF(unicode);
3965 return NULL;
3966 }
Victor Stinner17efeed2011-10-04 20:05:46 +02003967#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02003968 assert(_PyUnicode_CheckConsistency(unicode, 1));
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003969 return (PyObject *)unicode;
3970
Benjamin Peterson29060642009-01-31 22:14:21 +00003971 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003972 Py_XDECREF(errorHandler);
3973 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003974 Py_DECREF(unicode);
3975 return NULL;
3976}
3977
3978
Alexander Belopolsky40018472011-02-26 01:02:56 +00003979PyObject *
3980PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003981 Py_ssize_t size,
3982 int base64SetO,
3983 int base64WhiteSpace,
3984 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003985{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003986 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003987 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003988 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003989 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003990 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003991 unsigned int base64bits = 0;
3992 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003993 char * out;
3994 char * start;
3995
3996 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003997 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003998
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003999 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004000 return PyErr_NoMemory();
4001
Antoine Pitrou244651a2009-05-04 18:56:13 +00004002 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004003 if (v == NULL)
4004 return NULL;
4005
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004006 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004007 for (;i < size; ++i) {
4008 Py_UNICODE ch = s[i];
4009
Antoine Pitrou244651a2009-05-04 18:56:13 +00004010 if (inShift) {
4011 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4012 /* shifting out */
4013 if (base64bits) { /* output remaining bits */
4014 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4015 base64buffer = 0;
4016 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004017 }
4018 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004019 /* Characters not in the BASE64 set implicitly unshift the sequence
4020 so no '-' is required, except if the character is itself a '-' */
4021 if (IS_BASE64(ch) || ch == '-') {
4022 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004023 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004024 *out++ = (char) ch;
4025 }
4026 else {
4027 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004028 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004029 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004030 else { /* not in a shift sequence */
4031 if (ch == '+') {
4032 *out++ = '+';
4033 *out++ = '-';
4034 }
4035 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4036 *out++ = (char) ch;
4037 }
4038 else {
4039 *out++ = '+';
4040 inShift = 1;
4041 goto encode_char;
4042 }
4043 }
4044 continue;
4045encode_char:
4046#ifdef Py_UNICODE_WIDE
4047 if (ch >= 0x10000) {
4048 /* code first surrogate */
4049 base64bits += 16;
4050 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4051 while (base64bits >= 6) {
4052 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4053 base64bits -= 6;
4054 }
4055 /* prepare second surrogate */
4056 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
4057 }
4058#endif
4059 base64bits += 16;
4060 base64buffer = (base64buffer << 16) | ch;
4061 while (base64bits >= 6) {
4062 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4063 base64bits -= 6;
4064 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004065 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004066 if (base64bits)
4067 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4068 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004069 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004070 if (_PyBytes_Resize(&v, out - start) < 0)
4071 return NULL;
4072 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004073}
4074
/* The UTF-7 helper macros are local to the codec above. */
#undef IS_BASE64
#undef FROM_BASE64
#undef TO_BASE64
#undef DECODE_DIRECT
#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004080
/* --- UTF-8 Codec -------------------------------------------------------- */
4082
/* Read-only lookup table indexed by the first byte of a UTF-8 sequence. */
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
4104
Alexander Belopolsky40018472011-02-26 01:02:56 +00004105PyObject *
4106PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004107 Py_ssize_t size,
4108 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004109{
Walter Dörwald69652032004-09-07 20:24:22 +00004110 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4111}
4112
/* Mask to check or force alignment of a pointer to C 'long' boundaries.
   The expansion is fully parenthesized so it combines safely with
   surrounding operators. */
#define LONG_PTR_MASK ((size_t) (SIZEOF_LONG - 1))
4115
4116/* Mask to quickly check whether a C 'long' contains a
4117 non-ASCII, UTF8-encoded char. */
4118#if (SIZEOF_LONG == 8)
4119# define ASCII_CHAR_MASK 0x8080808080808080L
4120#elif (SIZEOF_LONG == 4)
4121# define ASCII_CHAR_MASK 0x80808080L
4122#else
4123# error C 'long' size should be either 4 or 8!
4124#endif
4125
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004126/* Scans a UTF-8 string and returns the maximum character to be expected,
4127 the size of the decoded unicode string and if any major errors were
4128 encountered.
4129
4130 This function does check basic UTF-8 sanity, it does however NOT CHECK
4131 if the string contains surrogates, and if all continuation bytes are
4132 within the correct ranges, these checks are performed in
4133 PyUnicode_DecodeUTF8Stateful.
4134
4135 If it sets has_errors to 1, it means the value of unicode_size and max_char
4136 will be bogus and you should not rely on useful information in them.
4137 */
4138static Py_UCS4
4139utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
4140 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
4141 int *has_errors)
4142{
4143 Py_ssize_t n;
4144 Py_ssize_t char_count = 0;
4145 Py_UCS4 max_char = 127, new_max;
4146 Py_UCS4 upper_bound;
4147 const unsigned char *p = (const unsigned char *)s;
4148 const unsigned char *end = p + string_size;
4149 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
4150 int err = 0;
4151
4152 for (; p < end && !err; ++p, ++char_count) {
4153 /* Only check value if it's not a ASCII char... */
4154 if (*p < 0x80) {
4155 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4156 an explanation. */
4157 if (!((size_t) p & LONG_PTR_MASK)) {
4158 /* Help register allocation */
4159 register const unsigned char *_p = p;
4160 while (_p < aligned_end) {
4161 unsigned long value = *(unsigned long *) _p;
4162 if (value & ASCII_CHAR_MASK)
4163 break;
4164 _p += SIZEOF_LONG;
4165 char_count += SIZEOF_LONG;
4166 }
4167 p = _p;
4168 if (p == end)
4169 break;
4170 }
4171 }
4172 if (*p >= 0x80) {
4173 n = utf8_code_length[*p];
4174 new_max = max_char;
4175 switch (n) {
4176 /* invalid start byte */
4177 case 0:
4178 err = 1;
4179 break;
4180 case 2:
4181 /* Code points between 0x00FF and 0x07FF inclusive.
4182 Approximate the upper bound of the code point,
4183 if this flips over 255 we can be sure it will be more
4184 than 255 and the string will need 2 bytes per code coint,
4185 if it stays under or equal to 255, we can be sure 1 byte
4186 is enough.
4187 ((*p & 0b00011111) << 6) | 0b00111111 */
4188 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
4189 if (max_char < upper_bound)
4190 new_max = upper_bound;
4191 /* Ensure we track at least that we left ASCII space. */
4192 if (new_max < 128)
4193 new_max = 128;
4194 break;
4195 case 3:
4196 /* Between 0x0FFF and 0xFFFF inclusive, so values are
4197 always > 255 and <= 65535 and will always need 2 bytes. */
4198 if (max_char < 65535)
4199 new_max = 65535;
4200 break;
4201 case 4:
4202 /* Code point will be above 0xFFFF for sure in this case. */
4203 new_max = 65537;
4204 break;
4205 /* Internal error, this should be caught by the first if */
4206 case 1:
4207 default:
4208 assert(0 && "Impossible case in utf8_max_char_and_size");
4209 err = 1;
4210 }
4211 /* Instead of number of overall bytes for this code point,
Georg Brandl7597add2011-10-05 16:36:47 +02004212 n contains the number of following bytes: */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004213 --n;
4214 /* Check if the follow up chars are all valid continuation bytes */
4215 if (n >= 1) {
4216 const unsigned char *cont;
4217 if ((p + n) >= end) {
4218 if (consumed == 0)
4219 /* incomplete data, non-incremental decoding */
4220 err = 1;
4221 break;
4222 }
4223 for (cont = p + 1; cont < (p + n); ++cont) {
4224 if ((*cont & 0xc0) != 0x80) {
4225 err = 1;
4226 break;
4227 }
4228 }
4229 p += n;
4230 }
4231 else
4232 err = 1;
4233 max_char = new_max;
4234 }
4235 }
4236
4237 if (unicode_size)
4238 *unicode_size = char_count;
4239 if (has_errors)
4240 *has_errors = err;
4241 return max_char;
4242}
4243
/* Similar to PyUnicode_WRITE but can also write into wstr field
   of the legacy unicode representation.

   kind  - PyUnicode_WCHAR_KIND, PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND,
           or anything else, which is treated as 4 bytes per character
   buf   - start of the character buffer, cast according to kind
   index - element index to store at
   value - character to store, cast to the buffer's element type

   kind is evaluated once; only the branch selected by it is executed. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value) \
    do { \
        const int k_ = (kind); \
        if (k_ == PyUnicode_WCHAR_KIND) \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
        else if (k_ == PyUnicode_1BYTE_KIND) \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
        else if (k_ == PyUnicode_2BYTE_KIND) \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value); \
        else \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value); \
    } while (0)
4258
Alexander Belopolsky40018472011-02-26 01:02:56 +00004259PyObject *
4260PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004261 Py_ssize_t size,
4262 const char *errors,
4263 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004264{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004265 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004266 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004267 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004268 Py_ssize_t startinpos;
4269 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00004270 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004271 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004272 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004273 PyObject *errorHandler = NULL;
4274 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004275 Py_UCS4 maxchar = 0;
4276 Py_ssize_t unicode_size;
4277 Py_ssize_t i;
4278 int kind;
4279 void *data;
4280 int has_errors;
4281 Py_UNICODE *error_outptr;
4282#if SIZEOF_WCHAR_T == 2
4283 Py_ssize_t wchar_offset = 0;
4284#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004285
Walter Dörwald69652032004-09-07 20:24:22 +00004286 if (size == 0) {
4287 if (consumed)
4288 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004289 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00004290 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004291 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
4292 consumed, &has_errors);
4293 if (has_errors) {
4294 unicode = _PyUnicode_New(size);
4295 if (!unicode)
4296 return NULL;
4297 kind = PyUnicode_WCHAR_KIND;
4298 data = PyUnicode_AS_UNICODE(unicode);
4299 assert(data != NULL);
4300 }
4301 else {
4302 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
4303 if (!unicode)
4304 return NULL;
4305 /* When the string is ASCII only, just use memcpy and return.
4306 unicode_size may be != size if there is an incomplete UTF-8
4307 sequence at the end of the ASCII block. */
4308 if (maxchar < 128 && size == unicode_size) {
4309 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
4310 return (PyObject *)unicode;
4311 }
4312 kind = PyUnicode_KIND(unicode);
4313 data = PyUnicode_DATA(unicode);
4314 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004315 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004316 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004317 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00004318 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004319
4320 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004321 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004322
4323 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004324 /* Fast path for runs of ASCII characters. Given that common UTF-8
4325 input will consist of an overwhelming majority of ASCII
4326 characters, we try to optimize for this case by checking
4327 as many characters as a C 'long' can contain.
4328 First, check if we can do an aligned read, as most CPUs have
4329 a penalty for unaligned reads.
4330 */
4331 if (!((size_t) s & LONG_PTR_MASK)) {
4332 /* Help register allocation */
4333 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004334 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004335 while (_s < aligned_end) {
4336 /* Read a whole long at a time (either 4 or 8 bytes),
4337 and do a fast unrolled copy if it only contains ASCII
4338 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004339 unsigned long value = *(unsigned long *) _s;
4340 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004341 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004342 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
4343 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
4344 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
4345 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004346#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004347 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
4348 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
4349 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
4350 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004351#endif
4352 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004353 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004354 }
4355 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004356 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004357 if (s == e)
4358 break;
4359 ch = (unsigned char)*s;
4360 }
4361 }
4362
4363 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004364 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004365 s++;
4366 continue;
4367 }
4368
4369 n = utf8_code_length[ch];
4370
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004371 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004372 if (consumed)
4373 break;
4374 else {
4375 errmsg = "unexpected end of data";
4376 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004377 endinpos = startinpos+1;
4378 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4379 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004380 goto utf8Error;
4381 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004382 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004383
4384 switch (n) {
4385
4386 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004387 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004388 startinpos = s-starts;
4389 endinpos = startinpos+1;
4390 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004391
4392 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004393 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004394 startinpos = s-starts;
4395 endinpos = startinpos+1;
4396 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004397
4398 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004399 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004400 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004401 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004402 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004403 goto utf8Error;
4404 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004405 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004406 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004407 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004408 break;
4409
4410 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004411 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4412 will result in surrogates in range d800-dfff. Surrogates are
4413 not valid UTF-8 so they are rejected.
4414 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4415 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004416 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004417 (s[2] & 0xc0) != 0x80 ||
4418 ((unsigned char)s[0] == 0xE0 &&
4419 (unsigned char)s[1] < 0xA0) ||
4420 ((unsigned char)s[0] == 0xED &&
4421 (unsigned char)s[1] > 0x9F)) {
4422 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004423 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004424 endinpos = startinpos + 1;
4425
4426 /* if s[1] first two bits are 1 and 0, then the invalid
4427 continuation byte is s[2], so increment endinpos by 1,
4428 if not, s[1] is invalid and endinpos doesn't need to
4429 be incremented. */
4430 if ((s[1] & 0xC0) == 0x80)
4431 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004432 goto utf8Error;
4433 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004434 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004435 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004436 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004437 break;
4438
4439 case 4:
4440 if ((s[1] & 0xc0) != 0x80 ||
4441 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004442 (s[3] & 0xc0) != 0x80 ||
4443 ((unsigned char)s[0] == 0xF0 &&
4444 (unsigned char)s[1] < 0x90) ||
4445 ((unsigned char)s[0] == 0xF4 &&
4446 (unsigned char)s[1] > 0x8F)) {
4447 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004448 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004449 endinpos = startinpos + 1;
4450 if ((s[1] & 0xC0) == 0x80) {
4451 endinpos++;
4452 if ((s[2] & 0xC0) == 0x80)
4453 endinpos++;
4454 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004455 goto utf8Error;
4456 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004457 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004458 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4459 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4460
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004461 /* If the string is flexible or we have native UCS-4, write
4462 directly.. */
4463 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
4464 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00004465
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004466 else {
4467 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00004468
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004469 /* translate from 10000..10FFFF to 0..FFFF */
4470 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00004471
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004472 /* high surrogate = top 10 bits added to D800 */
4473 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4474 (Py_UNICODE)(0xD800 + (ch >> 10)));
4475
4476 /* low surrogate = bottom 10 bits added to DC00 */
4477 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4478 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
4479 }
4480#if SIZEOF_WCHAR_T == 2
4481 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004482#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004483 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004484 }
4485 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004486 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004487
Benjamin Peterson29060642009-01-31 22:14:21 +00004488 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004489 /* If this is not yet a resizable string, make it one.. */
4490 if (kind != PyUnicode_WCHAR_KIND) {
4491 const Py_UNICODE *u;
4492 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
4493 if (!new_unicode)
4494 goto onError;
4495 u = PyUnicode_AsUnicode((PyObject *)unicode);
4496 if (!u)
4497 goto onError;
4498#if SIZEOF_WCHAR_T == 2
4499 i += wchar_offset;
4500#endif
4501 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
4502 Py_DECREF(unicode);
4503 unicode = new_unicode;
4504 kind = 0;
4505 data = PyUnicode_AS_UNICODE(new_unicode);
4506 assert(data != NULL);
4507 }
4508 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00004509 if (unicode_decode_call_errorhandler(
4510 errors, &errorHandler,
4511 "utf8", errmsg,
4512 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004513 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00004514 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004515 /* Update data because unicode_decode_call_errorhandler might have
4516 re-created or resized the unicode object. */
4517 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004518 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004519 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004520 /* Ensure the unicode_size calculation above was correct: */
4521 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
4522
Walter Dörwald69652032004-09-07 20:24:22 +00004523 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004524 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004525
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004526 /* Adjust length and ready string when it contained errors and
4527 is of the old resizable kind. */
4528 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004529 if (PyUnicode_Resize((PyObject**)&unicode, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004530 goto onError;
4531 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004532
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004533 Py_XDECREF(errorHandler);
4534 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02004535#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004536 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004537 Py_DECREF(unicode);
4538 return NULL;
4539 }
Victor Stinner17efeed2011-10-04 20:05:46 +02004540#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004541 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004542 return (PyObject *)unicode;
4543
Benjamin Peterson29060642009-01-31 22:14:21 +00004544 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004545 Py_XDECREF(errorHandler);
4546 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004547 Py_DECREF(unicode);
4548 return NULL;
4549}
4550
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004551#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00004552
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004553#ifdef __APPLE__
4554
4555/* Simplified UTF-8 decoder using surrogateescape error handler,
4556 used to decode the command line arguments on Mac OS X. */
4557
4558wchar_t*
4559_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4560{
4561 int n;
4562 const char *e;
4563 wchar_t *unicode, *p;
4564
4565 /* Note: size will always be longer than the resulting Unicode
4566 character count */
4567 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4568 PyErr_NoMemory();
4569 return NULL;
4570 }
4571 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4572 if (!unicode)
4573 return NULL;
4574
4575 /* Unpack UTF-8 encoded data */
4576 p = unicode;
4577 e = s + size;
4578 while (s < e) {
4579 Py_UCS4 ch = (unsigned char)*s;
4580
4581 if (ch < 0x80) {
4582 *p++ = (wchar_t)ch;
4583 s++;
4584 continue;
4585 }
4586
4587 n = utf8_code_length[ch];
4588 if (s + n > e) {
4589 goto surrogateescape;
4590 }
4591
4592 switch (n) {
4593 case 0:
4594 case 1:
4595 goto surrogateescape;
4596
4597 case 2:
4598 if ((s[1] & 0xc0) != 0x80)
4599 goto surrogateescape;
4600 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4601 assert ((ch > 0x007F) && (ch <= 0x07FF));
4602 *p++ = (wchar_t)ch;
4603 break;
4604
4605 case 3:
4606 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4607 will result in surrogates in range d800-dfff. Surrogates are
4608 not valid UTF-8 so they are rejected.
4609 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4610 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4611 if ((s[1] & 0xc0) != 0x80 ||
4612 (s[2] & 0xc0) != 0x80 ||
4613 ((unsigned char)s[0] == 0xE0 &&
4614 (unsigned char)s[1] < 0xA0) ||
4615 ((unsigned char)s[0] == 0xED &&
4616 (unsigned char)s[1] > 0x9F)) {
4617
4618 goto surrogateescape;
4619 }
4620 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4621 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004622 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004623 break;
4624
4625 case 4:
4626 if ((s[1] & 0xc0) != 0x80 ||
4627 (s[2] & 0xc0) != 0x80 ||
4628 (s[3] & 0xc0) != 0x80 ||
4629 ((unsigned char)s[0] == 0xF0 &&
4630 (unsigned char)s[1] < 0x90) ||
4631 ((unsigned char)s[0] == 0xF4 &&
4632 (unsigned char)s[1] > 0x8F)) {
4633 goto surrogateescape;
4634 }
4635 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4636 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4637 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4638
4639#if SIZEOF_WCHAR_T == 4
4640 *p++ = (wchar_t)ch;
4641#else
4642 /* compute and append the two surrogates: */
4643
4644 /* translate from 10000..10FFFF to 0..FFFF */
4645 ch -= 0x10000;
4646
4647 /* high surrogate = top 10 bits added to D800 */
4648 *p++ = (wchar_t)(0xD800 + (ch >> 10));
4649
4650 /* low surrogate = bottom 10 bits added to DC00 */
4651 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
4652#endif
4653 break;
4654 }
4655 s += n;
4656 continue;
4657
4658 surrogateescape:
4659 *p++ = 0xDC00 + ch;
4660 s++;
4661 }
4662 *p = L'\0';
4663 return unicode;
4664}
4665
4666#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004667
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004668/* Primary internal function which creates utf8 encoded bytes objects.
4669
4670 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004671 and allocate exactly as much space needed at the end. Else allocate the
4672 maximum possible needed (4 result bytes per Unicode character), and return
4673 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004674*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004675PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004676_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004677{
Tim Peters602f7402002-04-27 18:03:26 +00004678#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004679
Guido van Rossum98297ee2007-11-06 21:34:58 +00004680 Py_ssize_t i; /* index into s of next input byte */
4681 PyObject *result; /* result string object */
4682 char *p; /* next free byte in output buffer */
4683 Py_ssize_t nallocated; /* number of result bytes allocated */
4684 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004685 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004686 PyObject *errorHandler = NULL;
4687 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004688 int kind;
4689 void *data;
4690 Py_ssize_t size;
4691 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4692#if SIZEOF_WCHAR_T == 2
4693 Py_ssize_t wchar_offset = 0;
4694#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004695
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004696 if (!PyUnicode_Check(unicode)) {
4697 PyErr_BadArgument();
4698 return NULL;
4699 }
4700
4701 if (PyUnicode_READY(unicode) == -1)
4702 return NULL;
4703
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004704 if (PyUnicode_UTF8(unicode))
4705 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4706 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004707
4708 kind = PyUnicode_KIND(unicode);
4709 data = PyUnicode_DATA(unicode);
4710 size = PyUnicode_GET_LENGTH(unicode);
4711
Tim Peters602f7402002-04-27 18:03:26 +00004712 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004713
Tim Peters602f7402002-04-27 18:03:26 +00004714 if (size <= MAX_SHORT_UNICHARS) {
4715 /* Write into the stack buffer; nallocated can't overflow.
4716 * At the end, we'll allocate exactly as much heap space as it
4717 * turns out we need.
4718 */
4719 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004720 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004721 p = stackbuf;
4722 }
4723 else {
4724 /* Overallocate on the heap, and give the excess back at the end. */
4725 nallocated = size * 4;
4726 if (nallocated / 4 != size) /* overflow! */
4727 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004728 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004729 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004730 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004731 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004732 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004733
Tim Peters602f7402002-04-27 18:03:26 +00004734 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004735 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004736
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004737 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004738 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004739 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004740
Guido van Rossumd57fd912000-03-10 22:53:23 +00004741 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004742 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004743 *p++ = (char)(0xc0 | (ch >> 6));
4744 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004745 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004746 Py_ssize_t newpos;
4747 PyObject *rep;
4748 Py_ssize_t repsize, k, startpos;
4749 startpos = i-1;
4750#if SIZEOF_WCHAR_T == 2
4751 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004752#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004753 rep = unicode_encode_call_errorhandler(
4754 errors, &errorHandler, "utf-8", "surrogates not allowed",
4755 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4756 &exc, startpos, startpos+1, &newpos);
4757 if (!rep)
4758 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004759
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004760 if (PyBytes_Check(rep))
4761 repsize = PyBytes_GET_SIZE(rep);
4762 else
4763 repsize = PyUnicode_GET_SIZE(rep);
4764
4765 if (repsize > 4) {
4766 Py_ssize_t offset;
4767
4768 if (result == NULL)
4769 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004770 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004771 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004772
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004773 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4774 /* integer overflow */
4775 PyErr_NoMemory();
4776 goto error;
4777 }
4778 nallocated += repsize - 4;
4779 if (result != NULL) {
4780 if (_PyBytes_Resize(&result, nallocated) < 0)
4781 goto error;
4782 } else {
4783 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004784 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004785 goto error;
4786 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4787 }
4788 p = PyBytes_AS_STRING(result) + offset;
4789 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004790
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004791 if (PyBytes_Check(rep)) {
4792 char *prep = PyBytes_AS_STRING(rep);
4793 for(k = repsize; k > 0; k--)
4794 *p++ = *prep++;
4795 } else /* rep is unicode */ {
4796 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4797 Py_UNICODE c;
4798
4799 for(k=0; k<repsize; k++) {
4800 c = prep[k];
4801 if (0x80 <= c) {
4802 raise_encode_exception(&exc, "utf-8",
4803 PyUnicode_AS_UNICODE(unicode),
4804 size, i-1, i,
4805 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004806 goto error;
4807 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004808 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004809 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004810 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004811 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004812 } else if (ch < 0x10000) {
4813 *p++ = (char)(0xe0 | (ch >> 12));
4814 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4815 *p++ = (char)(0x80 | (ch & 0x3f));
4816 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004817 /* Encode UCS4 Unicode ordinals */
4818 *p++ = (char)(0xf0 | (ch >> 18));
4819 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4820 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4821 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004822#if SIZEOF_WCHAR_T == 2
4823 wchar_offset++;
4824#endif
Tim Peters602f7402002-04-27 18:03:26 +00004825 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004826 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004827
Guido van Rossum98297ee2007-11-06 21:34:58 +00004828 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004829 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004830 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004831 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004832 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004833 }
4834 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004835 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004836 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004837 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004838 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004839 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004840
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004841 Py_XDECREF(errorHandler);
4842 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004843 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004844 error:
4845 Py_XDECREF(errorHandler);
4846 Py_XDECREF(exc);
4847 Py_XDECREF(result);
4848 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004849
Tim Peters602f7402002-04-27 18:03:26 +00004850#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004851}
4852
Alexander Belopolsky40018472011-02-26 01:02:56 +00004853PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004854PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4855 Py_ssize_t size,
4856 const char *errors)
4857{
4858 PyObject *v, *unicode;
4859
4860 unicode = PyUnicode_FromUnicode(s, size);
4861 if (unicode == NULL)
4862 return NULL;
4863 v = _PyUnicode_AsUTF8String(unicode, errors);
4864 Py_DECREF(unicode);
4865 return v;
4866}
4867
4868PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004869PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004870{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004871 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004872}
4873
Walter Dörwald41980ca2007-08-16 21:55:45 +00004874/* --- UTF-32 Codec ------------------------------------------------------- */
4875
4876PyObject *
4877PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004878 Py_ssize_t size,
4879 const char *errors,
4880 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004881{
4882 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4883}
4884
4885PyObject *
4886PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004887 Py_ssize_t size,
4888 const char *errors,
4889 int *byteorder,
4890 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004891{
4892 const char *starts = s;
4893 Py_ssize_t startinpos;
4894 Py_ssize_t endinpos;
4895 Py_ssize_t outpos;
4896 PyUnicodeObject *unicode;
4897 Py_UNICODE *p;
4898#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004899 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004900 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004901#else
4902 const int pairs = 0;
4903#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004904 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004905 int bo = 0; /* assume native ordering by default */
4906 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004907 /* Offsets from q for retrieving bytes in the right order. */
4908#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4909 int iorder[] = {0, 1, 2, 3};
4910#else
4911 int iorder[] = {3, 2, 1, 0};
4912#endif
4913 PyObject *errorHandler = NULL;
4914 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004915
Walter Dörwald41980ca2007-08-16 21:55:45 +00004916 q = (unsigned char *)s;
4917 e = q + size;
4918
4919 if (byteorder)
4920 bo = *byteorder;
4921
4922 /* Check for BOM marks (U+FEFF) in the input and adjust current
4923 byte order setting accordingly. In native mode, the leading BOM
4924 mark is skipped, in all other modes, it is copied to the output
4925 stream as-is (giving a ZWNBSP character). */
4926 if (bo == 0) {
4927 if (size >= 4) {
4928 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004929 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004930#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004931 if (bom == 0x0000FEFF) {
4932 q += 4;
4933 bo = -1;
4934 }
4935 else if (bom == 0xFFFE0000) {
4936 q += 4;
4937 bo = 1;
4938 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004939#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004940 if (bom == 0x0000FEFF) {
4941 q += 4;
4942 bo = 1;
4943 }
4944 else if (bom == 0xFFFE0000) {
4945 q += 4;
4946 bo = -1;
4947 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004948#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004949 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004950 }
4951
4952 if (bo == -1) {
4953 /* force LE */
4954 iorder[0] = 0;
4955 iorder[1] = 1;
4956 iorder[2] = 2;
4957 iorder[3] = 3;
4958 }
4959 else if (bo == 1) {
4960 /* force BE */
4961 iorder[0] = 3;
4962 iorder[1] = 2;
4963 iorder[2] = 1;
4964 iorder[3] = 0;
4965 }
4966
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004967 /* On narrow builds we split characters outside the BMP into two
4968 codepoints => count how much extra space we need. */
4969#ifndef Py_UNICODE_WIDE
4970 for (qq = q; qq < e; qq += 4)
4971 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4972 pairs++;
4973#endif
4974
4975 /* This might be one to much, because of a BOM */
4976 unicode = _PyUnicode_New((size+3)/4+pairs);
4977 if (!unicode)
4978 return NULL;
4979 if (size == 0)
4980 return (PyObject *)unicode;
4981
4982 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004983 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004984
Walter Dörwald41980ca2007-08-16 21:55:45 +00004985 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004986 Py_UCS4 ch;
4987 /* remaining bytes at the end? (size should be divisible by 4) */
4988 if (e-q<4) {
4989 if (consumed)
4990 break;
4991 errmsg = "truncated data";
4992 startinpos = ((const char *)q)-starts;
4993 endinpos = ((const char *)e)-starts;
4994 goto utf32Error;
4995 /* The remaining input chars are ignored if the callback
4996 chooses to skip the input */
4997 }
4998 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4999 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005000
Benjamin Peterson29060642009-01-31 22:14:21 +00005001 if (ch >= 0x110000)
5002 {
5003 errmsg = "codepoint not in range(0x110000)";
5004 startinpos = ((const char *)q)-starts;
5005 endinpos = startinpos+4;
5006 goto utf32Error;
5007 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005008#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005009 if (ch >= 0x10000)
5010 {
5011 *p++ = 0xD800 | ((ch-0x10000) >> 10);
5012 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
5013 }
5014 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00005015#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005016 *p++ = ch;
5017 q += 4;
5018 continue;
5019 utf32Error:
5020 outpos = p-PyUnicode_AS_UNICODE(unicode);
5021 if (unicode_decode_call_errorhandler(
5022 errors, &errorHandler,
5023 "utf32", errmsg,
5024 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
5025 &unicode, &outpos, &p))
5026 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005027 }
5028
5029 if (byteorder)
5030 *byteorder = bo;
5031
5032 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005033 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005034
5035 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02005036 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005037 goto onError;
5038
5039 Py_XDECREF(errorHandler);
5040 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005041#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005042 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005043 Py_DECREF(unicode);
5044 return NULL;
5045 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005046#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005047 assert(_PyUnicode_CheckConsistency(unicode, 1));
Walter Dörwald41980ca2007-08-16 21:55:45 +00005048 return (PyObject *)unicode;
5049
Benjamin Peterson29060642009-01-31 22:14:21 +00005050 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005051 Py_DECREF(unicode);
5052 Py_XDECREF(errorHandler);
5053 Py_XDECREF(exc);
5054 return NULL;
5055}
5056
5057PyObject *
5058PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005059 Py_ssize_t size,
5060 const char *errors,
5061 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005062{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005063 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005064 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005065 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005066#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005067 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005068#else
5069 const int pairs = 0;
5070#endif
5071 /* Offsets from p for storing byte pairs in the right order. */
5072#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5073 int iorder[] = {0, 1, 2, 3};
5074#else
5075 int iorder[] = {3, 2, 1, 0};
5076#endif
5077
Benjamin Peterson29060642009-01-31 22:14:21 +00005078#define STORECHAR(CH) \
5079 do { \
5080 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5081 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5082 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5083 p[iorder[0]] = (CH) & 0xff; \
5084 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005085 } while(0)
5086
5087 /* In narrow builds we can output surrogate pairs as one codepoint,
5088 so we need less space. */
5089#ifndef Py_UNICODE_WIDE
5090 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005091 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
5092 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
5093 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005094#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005095 nsize = (size - pairs + (byteorder == 0));
5096 bytesize = nsize * 4;
5097 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005098 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005099 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005100 if (v == NULL)
5101 return NULL;
5102
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005103 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005104 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005105 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005106 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005107 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005108
5109 if (byteorder == -1) {
5110 /* force LE */
5111 iorder[0] = 0;
5112 iorder[1] = 1;
5113 iorder[2] = 2;
5114 iorder[3] = 3;
5115 }
5116 else if (byteorder == 1) {
5117 /* force BE */
5118 iorder[0] = 3;
5119 iorder[1] = 2;
5120 iorder[2] = 1;
5121 iorder[3] = 0;
5122 }
5123
5124 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005125 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005126#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005127 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
5128 Py_UCS4 ch2 = *s;
5129 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
5130 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
5131 s++;
5132 size--;
5133 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005134 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005135#endif
5136 STORECHAR(ch);
5137 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005138
5139 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005140 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005141#undef STORECHAR
5142}
5143
Alexander Belopolsky40018472011-02-26 01:02:56 +00005144PyObject *
5145PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005146{
5147 if (!PyUnicode_Check(unicode)) {
5148 PyErr_BadArgument();
5149 return NULL;
5150 }
5151 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005152 PyUnicode_GET_SIZE(unicode),
5153 NULL,
5154 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005155}
5156
Guido van Rossumd57fd912000-03-10 22:53:23 +00005157/* --- UTF-16 Codec ------------------------------------------------------- */
5158
Tim Peters772747b2001-08-09 22:21:55 +00005159PyObject *
5160PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005161 Py_ssize_t size,
5162 const char *errors,
5163 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005164{
Walter Dörwald69652032004-09-07 20:24:22 +00005165 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5166}
5167
Antoine Pitrouab868312009-01-10 15:40:25 +00005168/* Two masks for fast checking of whether a C 'long' may contain
5169 UTF16-encoded surrogate characters. This is an efficient heuristic,
5170 assuming that non-surrogate characters with a code point >= 0x8000 are
5171 rare in most input.
5172 FAST_CHAR_MASK is used when the input is in native byte ordering,
5173 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00005174*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005175#if (SIZEOF_LONG == 8)
5176# define FAST_CHAR_MASK 0x8000800080008000L
5177# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
5178#elif (SIZEOF_LONG == 4)
5179# define FAST_CHAR_MASK 0x80008000L
5180# define SWAPPED_FAST_CHAR_MASK 0x00800080L
5181#else
5182# error C 'long' size should be either 4 or 8!
5183#endif
5184
Walter Dörwald69652032004-09-07 20:24:22 +00005185PyObject *
5186PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005187 Py_ssize_t size,
5188 const char *errors,
5189 int *byteorder,
5190 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005191{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005192 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005193 Py_ssize_t startinpos;
5194 Py_ssize_t endinpos;
5195 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005196 PyUnicodeObject *unicode;
5197 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00005198 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005199 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005200 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005201 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005202 /* Offsets from q for retrieving byte pairs in the right order. */
5203#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5204 int ihi = 1, ilo = 0;
5205#else
5206 int ihi = 0, ilo = 1;
5207#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005208 PyObject *errorHandler = NULL;
5209 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005210
5211 /* Note: size will always be longer than the resulting Unicode
5212 character count */
5213 unicode = _PyUnicode_New(size);
5214 if (!unicode)
5215 return NULL;
5216 if (size == 0)
5217 return (PyObject *)unicode;
5218
5219 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005220 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00005221 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005222 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005223
5224 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005225 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005226
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005227 /* Check for BOM marks (U+FEFF) in the input and adjust current
5228 byte order setting accordingly. In native mode, the leading BOM
5229 mark is skipped, in all other modes, it is copied to the output
5230 stream as-is (giving a ZWNBSP character). */
5231 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005232 if (size >= 2) {
5233 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005234#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005235 if (bom == 0xFEFF) {
5236 q += 2;
5237 bo = -1;
5238 }
5239 else if (bom == 0xFFFE) {
5240 q += 2;
5241 bo = 1;
5242 }
Tim Petersced69f82003-09-16 20:30:58 +00005243#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005244 if (bom == 0xFEFF) {
5245 q += 2;
5246 bo = 1;
5247 }
5248 else if (bom == 0xFFFE) {
5249 q += 2;
5250 bo = -1;
5251 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005252#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005253 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005254 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005255
Tim Peters772747b2001-08-09 22:21:55 +00005256 if (bo == -1) {
5257 /* force LE */
5258 ihi = 1;
5259 ilo = 0;
5260 }
5261 else if (bo == 1) {
5262 /* force BE */
5263 ihi = 0;
5264 ilo = 1;
5265 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005266#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5267 native_ordering = ilo < ihi;
5268#else
5269 native_ordering = ilo > ihi;
5270#endif
Tim Peters772747b2001-08-09 22:21:55 +00005271
Antoine Pitrouab868312009-01-10 15:40:25 +00005272 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005273 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005274 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005275 /* First check for possible aligned read of a C 'long'. Unaligned
5276 reads are more expensive, better to defer to another iteration. */
5277 if (!((size_t) q & LONG_PTR_MASK)) {
5278 /* Fast path for runs of non-surrogate chars. */
5279 register const unsigned char *_q = q;
5280 Py_UNICODE *_p = p;
5281 if (native_ordering) {
5282 /* Native ordering is simple: as long as the input cannot
5283 possibly contain a surrogate char, do an unrolled copy
5284 of several 16-bit code points to the target object.
5285 The non-surrogate check is done on several input bytes
5286 at a time (as many as a C 'long' can contain). */
5287 while (_q < aligned_end) {
5288 unsigned long data = * (unsigned long *) _q;
5289 if (data & FAST_CHAR_MASK)
5290 break;
5291 _p[0] = ((unsigned short *) _q)[0];
5292 _p[1] = ((unsigned short *) _q)[1];
5293#if (SIZEOF_LONG == 8)
5294 _p[2] = ((unsigned short *) _q)[2];
5295 _p[3] = ((unsigned short *) _q)[3];
5296#endif
5297 _q += SIZEOF_LONG;
5298 _p += SIZEOF_LONG / 2;
5299 }
5300 }
5301 else {
5302 /* Byteswapped ordering is similar, but we must decompose
5303 the copy bytewise, and take care of zero'ing out the
5304 upper bytes if the target object is in 32-bit units
5305 (that is, in UCS-4 builds). */
5306 while (_q < aligned_end) {
5307 unsigned long data = * (unsigned long *) _q;
5308 if (data & SWAPPED_FAST_CHAR_MASK)
5309 break;
5310 /* Zero upper bytes in UCS-4 builds */
5311#if (Py_UNICODE_SIZE > 2)
5312 _p[0] = 0;
5313 _p[1] = 0;
5314#if (SIZEOF_LONG == 8)
5315 _p[2] = 0;
5316 _p[3] = 0;
5317#endif
5318#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005319 /* Issue #4916; UCS-4 builds on big endian machines must
5320 fill the two last bytes of each 4-byte unit. */
5321#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
5322# define OFF 2
5323#else
5324# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00005325#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005326 ((unsigned char *) _p)[OFF + 1] = _q[0];
5327 ((unsigned char *) _p)[OFF + 0] = _q[1];
5328 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
5329 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
5330#if (SIZEOF_LONG == 8)
5331 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
5332 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
5333 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
5334 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
5335#endif
5336#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00005337 _q += SIZEOF_LONG;
5338 _p += SIZEOF_LONG / 2;
5339 }
5340 }
5341 p = _p;
5342 q = _q;
5343 if (q >= e)
5344 break;
5345 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005346 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005347
Benjamin Peterson14339b62009-01-31 16:36:08 +00005348 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005349
5350 if (ch < 0xD800 || ch > 0xDFFF) {
5351 *p++ = ch;
5352 continue;
5353 }
5354
5355 /* UTF-16 code pair: */
5356 if (q > e) {
5357 errmsg = "unexpected end of data";
5358 startinpos = (((const char *)q) - 2) - starts;
5359 endinpos = ((const char *)e) + 1 - starts;
5360 goto utf16Error;
5361 }
5362 if (0xD800 <= ch && ch <= 0xDBFF) {
5363 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
5364 q += 2;
5365 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00005366#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005367 *p++ = ch;
5368 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005369#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005370 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005371#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005372 continue;
5373 }
5374 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005375 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005376 startinpos = (((const char *)q)-4)-starts;
5377 endinpos = startinpos+2;
5378 goto utf16Error;
5379 }
5380
Benjamin Peterson14339b62009-01-31 16:36:08 +00005381 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005382 errmsg = "illegal encoding";
5383 startinpos = (((const char *)q)-2)-starts;
5384 endinpos = startinpos+2;
5385 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005386
Benjamin Peterson29060642009-01-31 22:14:21 +00005387 utf16Error:
5388 outpos = p - PyUnicode_AS_UNICODE(unicode);
5389 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005390 errors,
5391 &errorHandler,
5392 "utf16", errmsg,
5393 &starts,
5394 (const char **)&e,
5395 &startinpos,
5396 &endinpos,
5397 &exc,
5398 (const char **)&q,
5399 &unicode,
5400 &outpos,
5401 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00005402 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005403 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005404 /* remaining byte at the end? (size should be even) */
5405 if (e == q) {
5406 if (!consumed) {
5407 errmsg = "truncated data";
5408 startinpos = ((const char *)q) - starts;
5409 endinpos = ((const char *)e) + 1 - starts;
5410 outpos = p - PyUnicode_AS_UNICODE(unicode);
5411 if (unicode_decode_call_errorhandler(
5412 errors,
5413 &errorHandler,
5414 "utf16", errmsg,
5415 &starts,
5416 (const char **)&e,
5417 &startinpos,
5418 &endinpos,
5419 &exc,
5420 (const char **)&q,
5421 &unicode,
5422 &outpos,
5423 &p))
5424 goto onError;
5425 /* The remaining input chars are ignored if the callback
5426 chooses to skip the input */
5427 }
5428 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005429
5430 if (byteorder)
5431 *byteorder = bo;
5432
Walter Dörwald69652032004-09-07 20:24:22 +00005433 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005434 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005435
Guido van Rossumd57fd912000-03-10 22:53:23 +00005436 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02005437 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005438 goto onError;
5439
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005440 Py_XDECREF(errorHandler);
5441 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005442#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005443 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005444 Py_DECREF(unicode);
5445 return NULL;
5446 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005447#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005448 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005449 return (PyObject *)unicode;
5450
Benjamin Peterson29060642009-01-31 22:14:21 +00005451 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005452 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005453 Py_XDECREF(errorHandler);
5454 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005455 return NULL;
5456}
5457
Antoine Pitrouab868312009-01-10 15:40:25 +00005458#undef FAST_CHAR_MASK
5459#undef SWAPPED_FAST_CHAR_MASK
5460
Tim Peters772747b2001-08-09 22:21:55 +00005461PyObject *
5462PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005463 Py_ssize_t size,
5464 const char *errors,
5465 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005466{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005467 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005468 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005469 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005470#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005471 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005472#else
5473 const int pairs = 0;
5474#endif
Tim Peters772747b2001-08-09 22:21:55 +00005475 /* Offsets from p for storing byte pairs in the right order. */
5476#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5477 int ihi = 1, ilo = 0;
5478#else
5479 int ihi = 0, ilo = 1;
5480#endif
5481
Benjamin Peterson29060642009-01-31 22:14:21 +00005482#define STORECHAR(CH) \
5483 do { \
5484 p[ihi] = ((CH) >> 8) & 0xff; \
5485 p[ilo] = (CH) & 0xff; \
5486 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005487 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005488
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005489#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005490 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005491 if (s[i] >= 0x10000)
5492 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005493#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005494 /* 2 * (size + pairs + (byteorder == 0)) */
5495 if (size > PY_SSIZE_T_MAX ||
5496 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005497 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005498 nsize = size + pairs + (byteorder == 0);
5499 bytesize = nsize * 2;
5500 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005501 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005502 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005503 if (v == NULL)
5504 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005505
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005506 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005507 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005508 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005509 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005510 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005511
5512 if (byteorder == -1) {
5513 /* force LE */
5514 ihi = 1;
5515 ilo = 0;
5516 }
5517 else if (byteorder == 1) {
5518 /* force BE */
5519 ihi = 0;
5520 ilo = 1;
5521 }
5522
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005523 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005524 Py_UNICODE ch = *s++;
5525 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005526#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005527 if (ch >= 0x10000) {
5528 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5529 ch = 0xD800 | ((ch-0x10000) >> 10);
5530 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005531#endif
Tim Peters772747b2001-08-09 22:21:55 +00005532 STORECHAR(ch);
5533 if (ch2)
5534 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005535 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005536
5537 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005538 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005539#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005540}
5541
Alexander Belopolsky40018472011-02-26 01:02:56 +00005542PyObject *
5543PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005544{
5545 if (!PyUnicode_Check(unicode)) {
5546 PyErr_BadArgument();
5547 return NULL;
5548 }
5549 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005550 PyUnicode_GET_SIZE(unicode),
5551 NULL,
5552 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005553}
5554
5555/* --- Unicode Escape Codec ----------------------------------------------- */
5556
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005557/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5558 if all the escapes in the string make it still a valid ASCII string.
5559 Returns -1 if any escapes were found which cause the string to
5560 pop out of ASCII range. Otherwise returns the length of the
5561 required buffer to hold the string.
5562 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005563static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005564length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5565{
5566 const unsigned char *p = (const unsigned char *)s;
5567 const unsigned char *end = p + size;
5568 Py_ssize_t length = 0;
5569
5570 if (size < 0)
5571 return -1;
5572
5573 for (; p < end; ++p) {
5574 if (*p > 127) {
5575 /* Non-ASCII */
5576 return -1;
5577 }
5578 else if (*p != '\\') {
5579 /* Normal character */
5580 ++length;
5581 }
5582 else {
5583 /* Backslash-escape, check next char */
5584 ++p;
5585 /* Escape sequence reaches till end of string or
5586 non-ASCII follow-up. */
5587 if (p >= end || *p > 127)
5588 return -1;
5589 switch (*p) {
5590 case '\n':
5591 /* backslash + \n result in zero characters */
5592 break;
5593 case '\\': case '\'': case '\"':
5594 case 'b': case 'f': case 't':
5595 case 'n': case 'r': case 'v': case 'a':
5596 ++length;
5597 break;
5598 case '0': case '1': case '2': case '3':
5599 case '4': case '5': case '6': case '7':
5600 case 'x': case 'u': case 'U': case 'N':
5601 /* these do not guarantee ASCII characters */
5602 return -1;
5603 default:
5604 /* count the backslash + the other character */
5605 length += 2;
5606 }
5607 }
5608 }
5609 return length;
5610}
5611
5612/* Similar to PyUnicode_WRITE but either write into wstr field
5613 or treat string as ASCII. */
5614#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \
5615 do { \
5616 if ((kind) != PyUnicode_WCHAR_KIND) \
5617 ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
5618 else \
5619 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
5620 } while (0)
5621
5622#define WRITE_WSTR(buf, index, value) \
5623 assert(kind == PyUnicode_WCHAR_KIND), \
5624 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value)
5625
5626
Fredrik Lundh06d12682001-01-24 07:59:11 +00005627static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005628
Alexander Belopolsky40018472011-02-26 01:02:56 +00005629PyObject *
5630PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005631 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005632 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005633{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005634 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005635 Py_ssize_t startinpos;
5636 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005637 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005638 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005639 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005640 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005641 char* message;
5642 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005643 PyObject *errorHandler = NULL;
5644 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005645 Py_ssize_t ascii_length;
5646 Py_ssize_t i;
5647 int kind;
5648 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005649
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005650 ascii_length = length_of_escaped_ascii_string(s, size);
5651
5652 /* After length_of_escaped_ascii_string() there are two alternatives,
5653 either the string is pure ASCII with named escapes like \n, etc.
5654 and we determined it's exact size (common case)
5655 or it contains \x, \u, ... escape sequences. then we create a
5656 legacy wchar string and resize it at the end of this function. */
5657 if (ascii_length >= 0) {
5658 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
5659 if (!v)
5660 goto onError;
5661 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5662 kind = PyUnicode_1BYTE_KIND;
5663 data = PyUnicode_DATA(v);
5664 }
5665 else {
5666 /* Escaped strings will always be longer than the resulting
5667 Unicode string, so we start with size here and then reduce the
5668 length after conversion to the true value.
5669 (but if the error callback returns a long replacement string
5670 we'll have to allocate more space) */
5671 v = _PyUnicode_New(size);
5672 if (!v)
5673 goto onError;
5674 kind = PyUnicode_WCHAR_KIND;
5675 data = PyUnicode_AS_UNICODE(v);
5676 }
5677
Guido van Rossumd57fd912000-03-10 22:53:23 +00005678 if (size == 0)
5679 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005680 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005681 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005682
Guido van Rossumd57fd912000-03-10 22:53:23 +00005683 while (s < end) {
5684 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005685 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005686 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005687
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005688 if (kind == PyUnicode_WCHAR_KIND) {
5689 assert(i < _PyUnicode_WSTR_LENGTH(v));
5690 }
5691 else {
5692 /* The only case in which i == ascii_length is a backslash
5693 followed by a newline. */
5694 assert(i <= ascii_length);
5695 }
5696
Guido van Rossumd57fd912000-03-10 22:53:23 +00005697 /* Non-escape characters are interpreted as Unicode ordinals */
5698 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005699 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005700 continue;
5701 }
5702
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005703 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005704 /* \ - Escapes */
5705 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005706 c = *s++;
5707 if (s > end)
5708 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005709
5710 if (kind == PyUnicode_WCHAR_KIND) {
5711 assert(i < _PyUnicode_WSTR_LENGTH(v));
5712 }
5713 else {
5714 /* The only case in which i == ascii_length is a backslash
5715 followed by a newline. */
5716 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5717 }
5718
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005719 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005720
Benjamin Peterson29060642009-01-31 22:14:21 +00005721 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005722 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005723 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5724 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5725 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5726 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5727 /* FF */
5728 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5729 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5730 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5731 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5732 /* VT */
5733 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5734 /* BEL, not classic C */
5735 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005736
Benjamin Peterson29060642009-01-31 22:14:21 +00005737 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005738 case '0': case '1': case '2': case '3':
5739 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005740 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005741 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005742 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005743 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005744 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005745 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005746 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005747 break;
5748
Benjamin Peterson29060642009-01-31 22:14:21 +00005749 /* hex escapes */
5750 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005751 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005752 digits = 2;
5753 message = "truncated \\xXX escape";
5754 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005755
Benjamin Peterson29060642009-01-31 22:14:21 +00005756 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005757 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005758 digits = 4;
5759 message = "truncated \\uXXXX escape";
5760 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005761
Benjamin Peterson29060642009-01-31 22:14:21 +00005762 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005763 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005764 digits = 8;
5765 message = "truncated \\UXXXXXXXX escape";
5766 hexescape:
5767 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005768 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005769 if (s+digits>end) {
5770 endinpos = size;
5771 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005772 errors, &errorHandler,
5773 "unicodeescape", "end of string in escape sequence",
5774 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005775 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005776 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005777 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005778 goto nextByte;
5779 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005780 for (j = 0; j < digits; ++j) {
5781 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005782 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005783 endinpos = (s+j+1)-starts;
5784 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005785 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005786 errors, &errorHandler,
5787 "unicodeescape", message,
5788 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005789 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005790 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005791 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005792 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005793 }
5794 chr = (chr<<4) & ~0xF;
5795 if (c >= '0' && c <= '9')
5796 chr += c - '0';
5797 else if (c >= 'a' && c <= 'f')
5798 chr += 10 + c - 'a';
5799 else
5800 chr += 10 + c - 'A';
5801 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005802 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005803 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005804 /* _decoding_error will have already written into the
5805 target buffer. */
5806 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005807 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005808 /* when we get here, chr is a 32-bit unicode character */
5809 if (chr <= 0xffff)
5810 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005811 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005812 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005813 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005814 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005815#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005816 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005817#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005818 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005819 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5820 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005821#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005822 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005823 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005824 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005825 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005826 errors, &errorHandler,
5827 "unicodeescape", "illegal Unicode character",
5828 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005829 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005830 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005831 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005832 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005833 break;
5834
Benjamin Peterson29060642009-01-31 22:14:21 +00005835 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005836 case 'N':
5837 message = "malformed \\N character escape";
5838 if (ucnhash_CAPI == NULL) {
5839 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005840 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5841 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005842 if (ucnhash_CAPI == NULL)
5843 goto ucnhashError;
5844 }
5845 if (*s == '{') {
5846 const char *start = s+1;
5847 /* look for the closing brace */
5848 while (*s != '}' && s < end)
5849 s++;
5850 if (s > start && s < end && *s == '}') {
5851 /* found a name. look it up in the unicode database */
5852 message = "unknown Unicode character name";
5853 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005854 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5855 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005856 goto store;
5857 }
5858 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005859 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005860 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005861 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005862 errors, &errorHandler,
5863 "unicodeescape", message,
5864 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005865 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005866 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005867 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005868 break;
5869
5870 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005871 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005872 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005873 message = "\\ at end of string";
5874 s--;
5875 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005876 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005877 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005878 errors, &errorHandler,
5879 "unicodeescape", message,
5880 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005881 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005882 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005883 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005884 }
5885 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005886 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5887 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005888 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005889 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005890 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005891 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005892 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005893 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005894 /* Ensure the length prediction worked in case of ASCII strings */
5895 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5896
Victor Stinnerfe226c02011-10-03 03:52:20 +02005897 if (kind == PyUnicode_WCHAR_KIND)
5898 {
5899 if (PyUnicode_Resize((PyObject**)&v, i) < 0)
5900 goto onError;
Victor Stinnerfe226c02011-10-03 03:52:20 +02005901 }
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005902 Py_XDECREF(errorHandler);
5903 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005904#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005905 if (_PyUnicode_READY_REPLACE(&v)) {
5906 Py_DECREF(v);
5907 return NULL;
5908 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005909#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005910 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005911 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005912
Benjamin Peterson29060642009-01-31 22:14:21 +00005913 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005914 PyErr_SetString(
5915 PyExc_UnicodeError,
5916 "\\N escapes not supported (can't load unicodedata module)"
5917 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005918 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005919 Py_XDECREF(errorHandler);
5920 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005921 return NULL;
5922
Benjamin Peterson29060642009-01-31 22:14:21 +00005923 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005924 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005925 Py_XDECREF(errorHandler);
5926 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005927 return NULL;
5928}
5929
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005930#undef WRITE_ASCII_OR_WSTR
5931#undef WRITE_WSTR
5932
Guido van Rossumd57fd912000-03-10 22:53:23 +00005933/* Return a Unicode-Escape string version of the Unicode object.
5934
5935 If quotes is true, the string is enclosed in u"" or u'' quotes as
5936 appropriate.
5937
5938*/
5939
Alexander Belopolsky40018472011-02-26 01:02:56 +00005940PyObject *
5941PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005942 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005943{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005944 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005945 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005946
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005947#ifdef Py_UNICODE_WIDE
5948 const Py_ssize_t expandsize = 10;
5949#else
5950 const Py_ssize_t expandsize = 6;
5951#endif
5952
Thomas Wouters89f507f2006-12-13 04:49:30 +00005953 /* XXX(nnorwitz): rather than over-allocating, it would be
5954 better to choose a different scheme. Perhaps scan the
5955 first N-chars of the string and allocate based on that size.
5956 */
5957 /* Initial allocation is based on the longest-possible unichr
5958 escape.
5959
5960 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5961 unichr, so in this case it's the longest unichr escape. In
5962 narrow (UTF-16) builds this is five chars per source unichr
5963 since there are two unichrs in the surrogate pair, so in narrow
5964 (UTF-16) builds it's not the longest unichr escape.
5965
5966 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5967 so in the narrow (UTF-16) build case it's the longest unichr
5968 escape.
5969 */
5970
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005971 if (size == 0)
5972 return PyBytes_FromStringAndSize(NULL, 0);
5973
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005974 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005975 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005976
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005977 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005978 2
5979 + expandsize*size
5980 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005981 if (repr == NULL)
5982 return NULL;
5983
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005984 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005985
Guido van Rossumd57fd912000-03-10 22:53:23 +00005986 while (size-- > 0) {
5987 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005988
Walter Dörwald79e913e2007-05-12 11:08:06 +00005989 /* Escape backslashes */
5990 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005991 *p++ = '\\';
5992 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005993 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005994 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005995
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005996#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005997 /* Map 21-bit characters to '\U00xxxxxx' */
5998 else if (ch >= 0x10000) {
5999 *p++ = '\\';
6000 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006001 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
6002 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
6003 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6004 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6005 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6006 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6007 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6008 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00006009 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006010 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00006011#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006012 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
6013 else if (ch >= 0xD800 && ch < 0xDC00) {
6014 Py_UNICODE ch2;
6015 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00006016
Benjamin Peterson29060642009-01-31 22:14:21 +00006017 ch2 = *s++;
6018 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00006019 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006020 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
6021 *p++ = '\\';
6022 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006023 *p++ = Py_hexdigits[(ucs >> 28) & 0x0000000F];
6024 *p++ = Py_hexdigits[(ucs >> 24) & 0x0000000F];
6025 *p++ = Py_hexdigits[(ucs >> 20) & 0x0000000F];
6026 *p++ = Py_hexdigits[(ucs >> 16) & 0x0000000F];
6027 *p++ = Py_hexdigits[(ucs >> 12) & 0x0000000F];
6028 *p++ = Py_hexdigits[(ucs >> 8) & 0x0000000F];
6029 *p++ = Py_hexdigits[(ucs >> 4) & 0x0000000F];
6030 *p++ = Py_hexdigits[ucs & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00006031 continue;
6032 }
6033 /* Fall through: isolated surrogates are copied as-is */
6034 s--;
6035 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006036 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00006037#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006038
Guido van Rossumd57fd912000-03-10 22:53:23 +00006039 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006040 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006041 *p++ = '\\';
6042 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006043 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6044 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6045 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6046 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006047 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006048
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006049 /* Map special whitespace to '\t', \n', '\r' */
6050 else if (ch == '\t') {
6051 *p++ = '\\';
6052 *p++ = 't';
6053 }
6054 else if (ch == '\n') {
6055 *p++ = '\\';
6056 *p++ = 'n';
6057 }
6058 else if (ch == '\r') {
6059 *p++ = '\\';
6060 *p++ = 'r';
6061 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006062
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006063 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006064 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006065 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006066 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006067 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6068 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006069 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006070
Guido van Rossumd57fd912000-03-10 22:53:23 +00006071 /* Copy everything else as-is */
6072 else
6073 *p++ = (char) ch;
6074 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006075
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006076 assert(p - PyBytes_AS_STRING(repr) > 0);
6077 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6078 return NULL;
6079 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006080}
6081
Alexander Belopolsky40018472011-02-26 01:02:56 +00006082PyObject *
6083PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006084{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006085 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006086 if (!PyUnicode_Check(unicode)) {
6087 PyErr_BadArgument();
6088 return NULL;
6089 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00006090 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
6091 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006092 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006093}
6094
6095/* --- Raw Unicode Escape Codec ------------------------------------------- */
6096
Alexander Belopolsky40018472011-02-26 01:02:56 +00006097PyObject *
6098PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006099 Py_ssize_t size,
6100 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006101{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006102 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006103 Py_ssize_t startinpos;
6104 Py_ssize_t endinpos;
6105 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006106 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006107 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006108 const char *end;
6109 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006110 PyObject *errorHandler = NULL;
6111 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006112
Guido van Rossumd57fd912000-03-10 22:53:23 +00006113 /* Escaped strings will always be longer than the resulting
6114 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006115 length after conversion to the true value. (But decoding error
6116 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006117 v = _PyUnicode_New(size);
6118 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006119 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006120 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006121 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006122 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006123 end = s + size;
6124 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006125 unsigned char c;
6126 Py_UCS4 x;
6127 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006128 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006129
Benjamin Peterson29060642009-01-31 22:14:21 +00006130 /* Non-escape characters are interpreted as Unicode ordinals */
6131 if (*s != '\\') {
6132 *p++ = (unsigned char)*s++;
6133 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006134 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006135 startinpos = s-starts;
6136
6137 /* \u-escapes are only interpreted iff the number of leading
6138 backslashes if odd */
6139 bs = s;
6140 for (;s < end;) {
6141 if (*s != '\\')
6142 break;
6143 *p++ = (unsigned char)*s++;
6144 }
6145 if (((s - bs) & 1) == 0 ||
6146 s >= end ||
6147 (*s != 'u' && *s != 'U')) {
6148 continue;
6149 }
6150 p--;
6151 count = *s=='u' ? 4 : 8;
6152 s++;
6153
6154 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
6155 outpos = p-PyUnicode_AS_UNICODE(v);
6156 for (x = 0, i = 0; i < count; ++i, ++s) {
6157 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006158 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006159 endinpos = s-starts;
6160 if (unicode_decode_call_errorhandler(
6161 errors, &errorHandler,
6162 "rawunicodeescape", "truncated \\uXXXX",
6163 &starts, &end, &startinpos, &endinpos, &exc, &s,
6164 &v, &outpos, &p))
6165 goto onError;
6166 goto nextByte;
6167 }
6168 x = (x<<4) & ~0xF;
6169 if (c >= '0' && c <= '9')
6170 x += c - '0';
6171 else if (c >= 'a' && c <= 'f')
6172 x += 10 + c - 'a';
6173 else
6174 x += 10 + c - 'A';
6175 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00006176 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00006177 /* UCS-2 character */
6178 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006179 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006180 /* UCS-4 character. Either store directly, or as
6181 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00006182#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006183 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006184#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006185 x -= 0x10000L;
6186 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
6187 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00006188#endif
6189 } else {
6190 endinpos = s-starts;
6191 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006192 if (unicode_decode_call_errorhandler(
6193 errors, &errorHandler,
6194 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006195 &starts, &end, &startinpos, &endinpos, &exc, &s,
6196 &v, &outpos, &p))
6197 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006198 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006199 nextByte:
6200 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006201 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02006202 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006203 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006204 Py_XDECREF(errorHandler);
6205 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006206#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006207 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006208 Py_DECREF(v);
6209 return NULL;
6210 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006211#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006212 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006213 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006214
Benjamin Peterson29060642009-01-31 22:14:21 +00006215 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006216 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006217 Py_XDECREF(errorHandler);
6218 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006219 return NULL;
6220}
6221
Alexander Belopolsky40018472011-02-26 01:02:56 +00006222PyObject *
6223PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006224 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006225{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006226 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006227 char *p;
6228 char *q;
6229
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006230#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006231 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006232#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006233 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006234#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00006235
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006236 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006237 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006238
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006239 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006240 if (repr == NULL)
6241 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00006242 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006243 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006244
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006245 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006246 while (size-- > 0) {
6247 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006248#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006249 /* Map 32-bit characters to '\Uxxxxxxxx' */
6250 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006251 *p++ = '\\';
6252 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006253 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6254 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6255 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6256 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6257 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6258 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6259 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6260 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006261 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006262 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00006263#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006264 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
6265 if (ch >= 0xD800 && ch < 0xDC00) {
6266 Py_UNICODE ch2;
6267 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006268
Benjamin Peterson29060642009-01-31 22:14:21 +00006269 ch2 = *s++;
6270 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00006271 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006272 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
6273 *p++ = '\\';
6274 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006275 *p++ = Py_hexdigits[(ucs >> 28) & 0xf];
6276 *p++ = Py_hexdigits[(ucs >> 24) & 0xf];
6277 *p++ = Py_hexdigits[(ucs >> 20) & 0xf];
6278 *p++ = Py_hexdigits[(ucs >> 16) & 0xf];
6279 *p++ = Py_hexdigits[(ucs >> 12) & 0xf];
6280 *p++ = Py_hexdigits[(ucs >> 8) & 0xf];
6281 *p++ = Py_hexdigits[(ucs >> 4) & 0xf];
6282 *p++ = Py_hexdigits[ucs & 0xf];
Benjamin Peterson29060642009-01-31 22:14:21 +00006283 continue;
6284 }
6285 /* Fall through: isolated surrogates are copied as-is */
6286 s--;
6287 size++;
6288 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006289#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006290 /* Map 16-bit characters to '\uxxxx' */
6291 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006292 *p++ = '\\';
6293 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006294 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6295 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6296 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6297 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006298 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006299 /* Copy everything else as-is */
6300 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006301 *p++ = (char) ch;
6302 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006303 size = p - q;
6304
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006305 assert(size > 0);
6306 if (_PyBytes_Resize(&repr, size) < 0)
6307 return NULL;
6308 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006309}
6310
Alexander Belopolsky40018472011-02-26 01:02:56 +00006311PyObject *
6312PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006313{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006314 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006315 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00006316 PyErr_BadArgument();
6317 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006318 }
Walter Dörwald711005d2007-05-12 12:03:26 +00006319 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
6320 PyUnicode_GET_SIZE(unicode));
6321
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006322 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006323}
6324
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006325/* --- Unicode Internal Codec ------------------------------------------- */
6326
Alexander Belopolsky40018472011-02-26 01:02:56 +00006327PyObject *
6328_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006329 Py_ssize_t size,
6330 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006331{
6332 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006333 Py_ssize_t startinpos;
6334 Py_ssize_t endinpos;
6335 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006336 PyUnicodeObject *v;
6337 Py_UNICODE *p;
6338 const char *end;
6339 const char *reason;
6340 PyObject *errorHandler = NULL;
6341 PyObject *exc = NULL;
6342
Neal Norwitzd43069c2006-01-08 01:12:10 +00006343#ifdef Py_UNICODE_WIDE
6344 Py_UNICODE unimax = PyUnicode_GetMax();
6345#endif
6346
Thomas Wouters89f507f2006-12-13 04:49:30 +00006347 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006348 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
6349 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006350 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006351 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
6352 as string was created with the old API. */
6353 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006354 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006355 p = PyUnicode_AS_UNICODE(v);
6356 end = s + size;
6357
6358 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006359 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006360 /* We have to sanity check the raw data, otherwise doom looms for
6361 some malformed UCS-4 data. */
6362 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006363#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006364 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006365#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006366 end-s < Py_UNICODE_SIZE
6367 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006368 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006369 startinpos = s - starts;
6370 if (end-s < Py_UNICODE_SIZE) {
6371 endinpos = end-starts;
6372 reason = "truncated input";
6373 }
6374 else {
6375 endinpos = s - starts + Py_UNICODE_SIZE;
6376 reason = "illegal code point (> 0x10FFFF)";
6377 }
6378 outpos = p - PyUnicode_AS_UNICODE(v);
6379 if (unicode_decode_call_errorhandler(
6380 errors, &errorHandler,
6381 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006382 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00006383 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006384 goto onError;
6385 }
6386 }
6387 else {
6388 p++;
6389 s += Py_UNICODE_SIZE;
6390 }
6391 }
6392
Victor Stinnerfe226c02011-10-03 03:52:20 +02006393 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006394 goto onError;
6395 Py_XDECREF(errorHandler);
6396 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006397#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006398 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006399 Py_DECREF(v);
6400 return NULL;
6401 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006402#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006403 assert(_PyUnicode_CheckConsistency(v, 1));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006404 return (PyObject *)v;
6405
Benjamin Peterson29060642009-01-31 22:14:21 +00006406 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006407 Py_XDECREF(v);
6408 Py_XDECREF(errorHandler);
6409 Py_XDECREF(exc);
6410 return NULL;
6411}
6412
Guido van Rossumd57fd912000-03-10 22:53:23 +00006413/* --- Latin-1 Codec ------------------------------------------------------ */
6414
Alexander Belopolsky40018472011-02-26 01:02:56 +00006415PyObject *
6416PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006417 Py_ssize_t size,
6418 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006419{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006420 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006421 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006422}
6423
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006424/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006425static void
6426make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006427 const char *encoding,
6428 const Py_UNICODE *unicode, Py_ssize_t size,
6429 Py_ssize_t startpos, Py_ssize_t endpos,
6430 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006431{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006432 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006433 *exceptionObject = PyUnicodeEncodeError_Create(
6434 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006435 }
6436 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006437 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6438 goto onError;
6439 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6440 goto onError;
6441 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6442 goto onError;
6443 return;
6444 onError:
6445 Py_DECREF(*exceptionObject);
6446 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006447 }
6448}
6449
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006450/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006451static void
6452raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006453 const char *encoding,
6454 const Py_UNICODE *unicode, Py_ssize_t size,
6455 Py_ssize_t startpos, Py_ssize_t endpos,
6456 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006457{
6458 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006459 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006460 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006461 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006462}
6463
6464/* error handling callback helper:
6465 build arguments, call the callback and check the arguments,
6466 put the result into newpos and return the replacement string, which
6467 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006468static PyObject *
6469unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006470 PyObject **errorHandler,
6471 const char *encoding, const char *reason,
6472 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
6473 Py_ssize_t startpos, Py_ssize_t endpos,
6474 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006475{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006476 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006477
6478 PyObject *restuple;
6479 PyObject *resunicode;
6480
6481 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006482 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006483 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006484 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006485 }
6486
6487 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006488 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006489 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006490 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006491
6492 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006493 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006494 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006495 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006496 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006497 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006498 Py_DECREF(restuple);
6499 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006500 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006501 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006502 &resunicode, newpos)) {
6503 Py_DECREF(restuple);
6504 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006505 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006506 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6507 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6508 Py_DECREF(restuple);
6509 return NULL;
6510 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006511 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006512 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006513 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006514 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6515 Py_DECREF(restuple);
6516 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006517 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006518 Py_INCREF(resunicode);
6519 Py_DECREF(restuple);
6520 return resunicode;
6521}
6522
Alexander Belopolsky40018472011-02-26 01:02:56 +00006523static PyObject *
6524unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006525 Py_ssize_t size,
6526 const char *errors,
6527 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006528{
6529 /* output object */
6530 PyObject *res;
6531 /* pointers to the beginning and end+1 of input */
6532 const Py_UNICODE *startp = p;
6533 const Py_UNICODE *endp = p + size;
6534 /* pointer to the beginning of the unencodable characters */
6535 /* const Py_UNICODE *badp = NULL; */
6536 /* pointer into the output */
6537 char *str;
6538 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006539 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006540 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6541 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006542 PyObject *errorHandler = NULL;
6543 PyObject *exc = NULL;
6544 /* the following variable is used for caching string comparisons
6545 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6546 int known_errorHandler = -1;
6547
6548 /* allocate enough for a simple encoding without
6549 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006550 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006551 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006552 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006553 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006554 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006555 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006556 ressize = size;
6557
6558 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006559 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006560
Benjamin Peterson29060642009-01-31 22:14:21 +00006561 /* can we encode this? */
6562 if (c<limit) {
6563 /* no overflow check, because we know that the space is enough */
6564 *str++ = (char)c;
6565 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006566 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006567 else {
6568 Py_ssize_t unicodepos = p-startp;
6569 Py_ssize_t requiredsize;
6570 PyObject *repunicode;
6571 Py_ssize_t repsize;
6572 Py_ssize_t newpos;
6573 Py_ssize_t respos;
6574 Py_UNICODE *uni2;
6575 /* startpos for collecting unencodable chars */
6576 const Py_UNICODE *collstart = p;
6577 const Py_UNICODE *collend = p;
6578 /* find all unecodable characters */
6579 while ((collend < endp) && ((*collend)>=limit))
6580 ++collend;
6581 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6582 if (known_errorHandler==-1) {
6583 if ((errors==NULL) || (!strcmp(errors, "strict")))
6584 known_errorHandler = 1;
6585 else if (!strcmp(errors, "replace"))
6586 known_errorHandler = 2;
6587 else if (!strcmp(errors, "ignore"))
6588 known_errorHandler = 3;
6589 else if (!strcmp(errors, "xmlcharrefreplace"))
6590 known_errorHandler = 4;
6591 else
6592 known_errorHandler = 0;
6593 }
6594 switch (known_errorHandler) {
6595 case 1: /* strict */
6596 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
6597 goto onError;
6598 case 2: /* replace */
6599 while (collstart++<collend)
6600 *str++ = '?'; /* fall through */
6601 case 3: /* ignore */
6602 p = collend;
6603 break;
6604 case 4: /* xmlcharrefreplace */
6605 respos = str - PyBytes_AS_STRING(res);
6606 /* determine replacement size (temporarily (mis)uses p) */
6607 for (p = collstart, repsize = 0; p < collend; ++p) {
6608 if (*p<10)
6609 repsize += 2+1+1;
6610 else if (*p<100)
6611 repsize += 2+2+1;
6612 else if (*p<1000)
6613 repsize += 2+3+1;
6614 else if (*p<10000)
6615 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006616#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006617 else
6618 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006619#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006620 else if (*p<100000)
6621 repsize += 2+5+1;
6622 else if (*p<1000000)
6623 repsize += 2+6+1;
6624 else
6625 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00006626#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006627 }
6628 requiredsize = respos+repsize+(endp-collend);
6629 if (requiredsize > ressize) {
6630 if (requiredsize<2*ressize)
6631 requiredsize = 2*ressize;
6632 if (_PyBytes_Resize(&res, requiredsize))
6633 goto onError;
6634 str = PyBytes_AS_STRING(res) + respos;
6635 ressize = requiredsize;
6636 }
6637 /* generate replacement (temporarily (mis)uses p) */
6638 for (p = collstart; p < collend; ++p) {
6639 str += sprintf(str, "&#%d;", (int)*p);
6640 }
6641 p = collend;
6642 break;
6643 default:
6644 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6645 encoding, reason, startp, size, &exc,
6646 collstart-startp, collend-startp, &newpos);
6647 if (repunicode == NULL)
6648 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006649 if (PyBytes_Check(repunicode)) {
6650 /* Directly copy bytes result to output. */
6651 repsize = PyBytes_Size(repunicode);
6652 if (repsize > 1) {
6653 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006654 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006655 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6656 Py_DECREF(repunicode);
6657 goto onError;
6658 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006659 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006660 ressize += repsize-1;
6661 }
6662 memcpy(str, PyBytes_AsString(repunicode), repsize);
6663 str += repsize;
6664 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006665 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006666 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006667 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006668 /* need more space? (at least enough for what we
6669 have+the replacement+the rest of the string, so
6670 we won't have to check space for encodable characters) */
6671 respos = str - PyBytes_AS_STRING(res);
6672 repsize = PyUnicode_GET_SIZE(repunicode);
6673 requiredsize = respos+repsize+(endp-collend);
6674 if (requiredsize > ressize) {
6675 if (requiredsize<2*ressize)
6676 requiredsize = 2*ressize;
6677 if (_PyBytes_Resize(&res, requiredsize)) {
6678 Py_DECREF(repunicode);
6679 goto onError;
6680 }
6681 str = PyBytes_AS_STRING(res) + respos;
6682 ressize = requiredsize;
6683 }
6684 /* check if there is anything unencodable in the replacement
6685 and copy it to the output */
6686 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
6687 c = *uni2;
6688 if (c >= limit) {
6689 raise_encode_exception(&exc, encoding, startp, size,
6690 unicodepos, unicodepos+1, reason);
6691 Py_DECREF(repunicode);
6692 goto onError;
6693 }
6694 *str = (char)c;
6695 }
6696 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006697 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006698 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006699 }
6700 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006701 /* Resize if we allocated to much */
6702 size = str - PyBytes_AS_STRING(res);
6703 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006704 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006705 if (_PyBytes_Resize(&res, size) < 0)
6706 goto onError;
6707 }
6708
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006709 Py_XDECREF(errorHandler);
6710 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006711 return res;
6712
6713 onError:
6714 Py_XDECREF(res);
6715 Py_XDECREF(errorHandler);
6716 Py_XDECREF(exc);
6717 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006718}
6719
Alexander Belopolsky40018472011-02-26 01:02:56 +00006720PyObject *
6721PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006722 Py_ssize_t size,
6723 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006724{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006725 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006726}
6727
Alexander Belopolsky40018472011-02-26 01:02:56 +00006728PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006729_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006730{
6731 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006732 PyErr_BadArgument();
6733 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006734 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006735 if (PyUnicode_READY(unicode) == -1)
6736 return NULL;
6737 /* Fast path: if it is a one-byte string, construct
6738 bytes object directly. */
6739 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6740 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6741 PyUnicode_GET_LENGTH(unicode));
6742 /* Non-Latin-1 characters present. Defer to above function to
6743 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006744 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006745 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006746 errors);
6747}
6748
6749PyObject*
6750PyUnicode_AsLatin1String(PyObject *unicode)
6751{
6752 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006753}
6754
6755/* --- 7-bit ASCII Codec -------------------------------------------------- */
6756
Alexander Belopolsky40018472011-02-26 01:02:56 +00006757PyObject *
6758PyUnicode_DecodeASCII(const char *s,
6759 Py_ssize_t size,
6760 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006761{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006762 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006763 PyUnicodeObject *v;
Victor Stinner702c7342011-10-05 13:50:52 +02006764 Py_UNICODE *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006765 Py_ssize_t startinpos;
6766 Py_ssize_t endinpos;
6767 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006768 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006769 int has_error;
6770 const unsigned char *p = (const unsigned char *)s;
6771 const unsigned char *end = p + size;
6772 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006773 PyObject *errorHandler = NULL;
6774 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006775
Guido van Rossumd57fd912000-03-10 22:53:23 +00006776 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006777 if (size == 1 && (unsigned char)s[0] < 128)
6778 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006779
Victor Stinner702c7342011-10-05 13:50:52 +02006780 has_error = 0;
6781 while (p < end && !has_error) {
6782 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6783 an explanation. */
6784 if (!((size_t) p & LONG_PTR_MASK)) {
6785 /* Help register allocation */
6786 register const unsigned char *_p = p;
6787 while (_p < aligned_end) {
6788 unsigned long value = *(unsigned long *) _p;
6789 if (value & ASCII_CHAR_MASK) {
6790 has_error = 1;
6791 break;
6792 }
6793 _p += SIZEOF_LONG;
6794 }
6795 if (_p == end)
6796 break;
6797 if (has_error)
6798 break;
6799 p = _p;
6800 }
6801 if (*p & 0x80) {
6802 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006803 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006804 }
6805 else {
6806 ++p;
6807 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006808 }
Victor Stinner702c7342011-10-05 13:50:52 +02006809 if (!has_error)
6810 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006811
Guido van Rossumd57fd912000-03-10 22:53:23 +00006812 v = _PyUnicode_New(size);
6813 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006814 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006815 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006816 return (PyObject *)v;
Victor Stinner702c7342011-10-05 13:50:52 +02006817 u = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006818 e = s + size;
6819 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006820 register unsigned char c = (unsigned char)*s;
6821 if (c < 128) {
Victor Stinner702c7342011-10-05 13:50:52 +02006822 *u++ = c;
Benjamin Peterson29060642009-01-31 22:14:21 +00006823 ++s;
6824 }
6825 else {
6826 startinpos = s-starts;
6827 endinpos = startinpos + 1;
Victor Stinner702c7342011-10-05 13:50:52 +02006828 outpos = u - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006829 if (unicode_decode_call_errorhandler(
6830 errors, &errorHandler,
6831 "ascii", "ordinal not in range(128)",
6832 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinner702c7342011-10-05 13:50:52 +02006833 &v, &outpos, &u))
Benjamin Peterson29060642009-01-31 22:14:21 +00006834 goto onError;
6835 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006836 }
Victor Stinner702c7342011-10-05 13:50:52 +02006837 if (u - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
6838 if (PyUnicode_Resize((PyObject**)&v, u - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006839 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006840 Py_XDECREF(errorHandler);
6841 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006842#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006843 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006844 Py_DECREF(v);
6845 return NULL;
6846 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006847#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006848 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006849 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006850
Benjamin Peterson29060642009-01-31 22:14:21 +00006851 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006852 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006853 Py_XDECREF(errorHandler);
6854 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006855 return NULL;
6856}
6857
Alexander Belopolsky40018472011-02-26 01:02:56 +00006858PyObject *
6859PyUnicode_EncodeASCII(const Py_UNICODE *p,
6860 Py_ssize_t size,
6861 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006862{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006863 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006864}
6865
Alexander Belopolsky40018472011-02-26 01:02:56 +00006866PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006867_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006868{
6869 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006870 PyErr_BadArgument();
6871 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006872 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006873 if (PyUnicode_READY(unicode) == -1)
6874 return NULL;
6875 /* Fast path: if it is an ASCII-only string, construct bytes object
6876 directly. Else defer to above function to raise the exception. */
6877 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6878 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6879 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006880 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006881 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006882 errors);
6883}
6884
6885PyObject *
6886PyUnicode_AsASCIIString(PyObject *unicode)
6887{
6888 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006889}
6890
Victor Stinner99b95382011-07-04 14:23:54 +02006891#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006892
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006893/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006894
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006895#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006896#define NEED_RETRY
6897#endif
6898
6899/* XXX This code is limited to "true" double-byte encodings, as
6900 a) it assumes an incomplete character consists of a single byte, and
6901 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006902 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006903
/* Return non-zero if the byte at s[offset] is a DBCS lead byte, judged
   both by IsDBCSLeadByte() on the byte itself and by where CharPrev()
   places the preceding character; return 0 otherwise. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *here = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*here))
        return 0;

    before = CharPrev(s, here);
    if (before == here)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return here - before == 2;
}
6915
6916/*
6917 * Decode MBCS string into unicode object. If 'final' is set, converts
6918 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6919 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006920static int
6921decode_mbcs(PyUnicodeObject **v,
6922 const char *s, /* MBCS string */
6923 int size, /* sizeof MBCS string */
6924 int final,
6925 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006926{
6927 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006928 Py_ssize_t n;
6929 DWORD usize;
6930 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006931
6932 assert(size >= 0);
6933
Victor Stinner554f3f02010-06-16 23:33:54 +00006934 /* check and handle 'errors' arg */
6935 if (errors==NULL || strcmp(errors, "strict")==0)
6936 flags = MB_ERR_INVALID_CHARS;
6937 else if (strcmp(errors, "ignore")==0)
6938 flags = 0;
6939 else {
6940 PyErr_Format(PyExc_ValueError,
6941 "mbcs encoding does not support errors='%s'",
6942 errors);
6943 return -1;
6944 }
6945
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006946 /* Skip trailing lead-byte unless 'final' is set */
6947 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006948 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006949
6950 /* First get the size of the result */
6951 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006952 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6953 if (usize==0)
6954 goto mbcs_decode_error;
6955 } else
6956 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006957
6958 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006959 /* Create unicode object */
6960 *v = _PyUnicode_New(usize);
6961 if (*v == NULL)
6962 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006963 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006964 }
6965 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006966 /* Extend unicode object */
6967 n = PyUnicode_GET_SIZE(*v);
Victor Stinner2fd82272011-10-03 04:06:05 +02006968 if (PyUnicode_Resize((PyObject**)v, n + usize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006969 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006970 }
6971
6972 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006973 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006974 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006975 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6976 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006977 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006978 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006979 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006980
6981mbcs_decode_error:
6982 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6983 we raise a UnicodeDecodeError - else it is a 'generic'
6984 windows error
6985 */
6986 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6987 /* Ideally, we should get reason from FormatMessage - this
6988 is the Windows 2000 English version of the message
6989 */
6990 PyObject *exc = NULL;
6991 const char *reason = "No mapping for the Unicode character exists "
6992 "in the target multi-byte code page.";
6993 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6994 if (exc != NULL) {
6995 PyCodec_StrictErrors(exc);
6996 Py_DECREF(exc);
6997 }
6998 } else {
6999 PyErr_SetFromWindowsErrWithFilename(0, NULL);
7000 }
7001 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007002}
7003
Alexander Belopolsky40018472011-02-26 01:02:56 +00007004PyObject *
7005PyUnicode_DecodeMBCSStateful(const char *s,
7006 Py_ssize_t size,
7007 const char *errors,
7008 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007009{
7010 PyUnicodeObject *v = NULL;
7011 int done;
7012
7013 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007014 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007015
7016#ifdef NEED_RETRY
7017 retry:
7018 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00007019 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007020 else
7021#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00007022 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007023
7024 if (done < 0) {
7025 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00007026 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007027 }
7028
7029 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007030 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007031
7032#ifdef NEED_RETRY
7033 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007034 s += done;
7035 size -= done;
7036 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007037 }
7038#endif
Victor Stinner17efeed2011-10-04 20:05:46 +02007039#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007040 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007041 Py_DECREF(v);
7042 return NULL;
7043 }
Victor Stinner17efeed2011-10-04 20:05:46 +02007044#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02007045 assert(_PyUnicode_CheckConsistency(v, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007046 return (PyObject *)v;
7047}
7048
Alexander Belopolsky40018472011-02-26 01:02:56 +00007049PyObject *
7050PyUnicode_DecodeMBCS(const char *s,
7051 Py_ssize_t size,
7052 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007053{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007054 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7055}
7056
7057/*
7058 * Convert unicode into string object (MBCS).
7059 * Returns 0 if succeed, -1 otherwise.
7060 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007061static int
7062encode_mbcs(PyObject **repr,
7063 const Py_UNICODE *p, /* unicode */
7064 int size, /* size of unicode */
7065 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007066{
Victor Stinner554f3f02010-06-16 23:33:54 +00007067 BOOL usedDefaultChar = FALSE;
7068 BOOL *pusedDefaultChar;
7069 int mbcssize;
7070 Py_ssize_t n;
7071 PyObject *exc = NULL;
7072 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007073
7074 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007075
Victor Stinner554f3f02010-06-16 23:33:54 +00007076 /* check and handle 'errors' arg */
7077 if (errors==NULL || strcmp(errors, "strict")==0) {
7078 flags = WC_NO_BEST_FIT_CHARS;
7079 pusedDefaultChar = &usedDefaultChar;
7080 } else if (strcmp(errors, "replace")==0) {
7081 flags = 0;
7082 pusedDefaultChar = NULL;
7083 } else {
7084 PyErr_Format(PyExc_ValueError,
7085 "mbcs encoding does not support errors='%s'",
7086 errors);
7087 return -1;
7088 }
7089
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007090 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007091 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00007092 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
7093 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00007094 if (mbcssize == 0) {
7095 PyErr_SetFromWindowsErrWithFilename(0, NULL);
7096 return -1;
7097 }
Victor Stinner554f3f02010-06-16 23:33:54 +00007098 /* If we used a default char, then we failed! */
7099 if (pusedDefaultChar && *pusedDefaultChar)
7100 goto mbcs_encode_error;
7101 } else {
7102 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007103 }
7104
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007105 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007106 /* Create string object */
7107 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
7108 if (*repr == NULL)
7109 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00007110 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007111 }
7112 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007113 /* Extend string object */
7114 n = PyBytes_Size(*repr);
7115 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
7116 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007117 }
7118
7119 /* Do the conversion */
7120 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007121 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00007122 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
7123 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007124 PyErr_SetFromWindowsErrWithFilename(0, NULL);
7125 return -1;
7126 }
Victor Stinner554f3f02010-06-16 23:33:54 +00007127 if (pusedDefaultChar && *pusedDefaultChar)
7128 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007129 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007130 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007131
7132mbcs_encode_error:
7133 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
7134 Py_XDECREF(exc);
7135 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007136}
7137
Alexander Belopolsky40018472011-02-26 01:02:56 +00007138PyObject *
7139PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7140 Py_ssize_t size,
7141 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007142{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007143 PyObject *repr = NULL;
7144 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00007145
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007146#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00007147 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007148 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00007149 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007150 else
7151#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00007152 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007153
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007154 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007155 Py_XDECREF(repr);
7156 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007157 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007158
7159#ifdef NEED_RETRY
7160 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007161 p += INT_MAX;
7162 size -= INT_MAX;
7163 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007164 }
7165#endif
7166
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007167 return repr;
7168}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007169
Alexander Belopolsky40018472011-02-26 01:02:56 +00007170PyObject *
7171PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007172{
7173 if (!PyUnicode_Check(unicode)) {
7174 PyErr_BadArgument();
7175 return NULL;
7176 }
7177 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007178 PyUnicode_GET_SIZE(unicode),
7179 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007180}
7181
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007182#undef NEED_RETRY
7183
Victor Stinner99b95382011-07-04 14:23:54 +02007184#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007185
Guido van Rossumd57fd912000-03-10 22:53:23 +00007186/* --- Character Mapping Codec -------------------------------------------- */
7187
Alexander Belopolsky40018472011-02-26 01:02:56 +00007188PyObject *
7189PyUnicode_DecodeCharmap(const char *s,
7190 Py_ssize_t size,
7191 PyObject *mapping,
7192 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007193{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007194 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007195 Py_ssize_t startinpos;
7196 Py_ssize_t endinpos;
7197 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007198 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007199 PyUnicodeObject *v;
7200 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007201 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007202 PyObject *errorHandler = NULL;
7203 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007204 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007205 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00007206
Guido van Rossumd57fd912000-03-10 22:53:23 +00007207 /* Default to Latin-1 */
7208 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007209 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007210
7211 v = _PyUnicode_New(size);
7212 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007213 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007214 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007215 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007216 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007217 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007218 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007219 mapstring = PyUnicode_AS_UNICODE(mapping);
7220 maplen = PyUnicode_GET_SIZE(mapping);
7221 while (s < e) {
7222 unsigned char ch = *s;
7223 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007224
Benjamin Peterson29060642009-01-31 22:14:21 +00007225 if (ch < maplen)
7226 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00007227
Benjamin Peterson29060642009-01-31 22:14:21 +00007228 if (x == 0xfffe) {
7229 /* undefined mapping */
7230 outpos = p-PyUnicode_AS_UNICODE(v);
7231 startinpos = s-starts;
7232 endinpos = startinpos+1;
7233 if (unicode_decode_call_errorhandler(
7234 errors, &errorHandler,
7235 "charmap", "character maps to <undefined>",
7236 &starts, &e, &startinpos, &endinpos, &exc, &s,
7237 &v, &outpos, &p)) {
7238 goto onError;
7239 }
7240 continue;
7241 }
7242 *p++ = x;
7243 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007244 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007245 }
7246 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007247 while (s < e) {
7248 unsigned char ch = *s;
7249 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007250
Benjamin Peterson29060642009-01-31 22:14:21 +00007251 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7252 w = PyLong_FromLong((long)ch);
7253 if (w == NULL)
7254 goto onError;
7255 x = PyObject_GetItem(mapping, w);
7256 Py_DECREF(w);
7257 if (x == NULL) {
7258 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7259 /* No mapping found means: mapping is undefined. */
7260 PyErr_Clear();
7261 x = Py_None;
7262 Py_INCREF(x);
7263 } else
7264 goto onError;
7265 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007266
Benjamin Peterson29060642009-01-31 22:14:21 +00007267 /* Apply mapping */
7268 if (PyLong_Check(x)) {
7269 long value = PyLong_AS_LONG(x);
7270 if (value < 0 || value > 65535) {
7271 PyErr_SetString(PyExc_TypeError,
7272 "character mapping must be in range(65536)");
7273 Py_DECREF(x);
7274 goto onError;
7275 }
7276 *p++ = (Py_UNICODE)value;
7277 }
7278 else if (x == Py_None) {
7279 /* undefined mapping */
7280 outpos = p-PyUnicode_AS_UNICODE(v);
7281 startinpos = s-starts;
7282 endinpos = startinpos+1;
7283 if (unicode_decode_call_errorhandler(
7284 errors, &errorHandler,
7285 "charmap", "character maps to <undefined>",
7286 &starts, &e, &startinpos, &endinpos, &exc, &s,
7287 &v, &outpos, &p)) {
7288 Py_DECREF(x);
7289 goto onError;
7290 }
7291 Py_DECREF(x);
7292 continue;
7293 }
7294 else if (PyUnicode_Check(x)) {
7295 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007296
Benjamin Peterson29060642009-01-31 22:14:21 +00007297 if (targetsize == 1)
7298 /* 1-1 mapping */
7299 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007300
Benjamin Peterson29060642009-01-31 22:14:21 +00007301 else if (targetsize > 1) {
7302 /* 1-n mapping */
7303 if (targetsize > extrachars) {
7304 /* resize first */
7305 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
7306 Py_ssize_t needed = (targetsize - extrachars) + \
7307 (targetsize << 2);
7308 extrachars += needed;
7309 /* XXX overflow detection missing */
Victor Stinnerfe226c02011-10-03 03:52:20 +02007310 if (PyUnicode_Resize((PyObject**)&v,
Benjamin Peterson29060642009-01-31 22:14:21 +00007311 PyUnicode_GET_SIZE(v) + needed) < 0) {
7312 Py_DECREF(x);
7313 goto onError;
7314 }
7315 p = PyUnicode_AS_UNICODE(v) + oldpos;
7316 }
7317 Py_UNICODE_COPY(p,
7318 PyUnicode_AS_UNICODE(x),
7319 targetsize);
7320 p += targetsize;
7321 extrachars -= targetsize;
7322 }
7323 /* 1-0 mapping: skip the character */
7324 }
7325 else {
7326 /* wrong return value */
7327 PyErr_SetString(PyExc_TypeError,
7328 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007329 Py_DECREF(x);
7330 goto onError;
7331 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007332 Py_DECREF(x);
7333 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007334 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007335 }
7336 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02007337 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007338 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007339 Py_XDECREF(errorHandler);
7340 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02007341#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007342 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007343 Py_DECREF(v);
7344 return NULL;
7345 }
Victor Stinner17efeed2011-10-04 20:05:46 +02007346#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02007347 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007348 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00007349
Benjamin Peterson29060642009-01-31 22:14:21 +00007350 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007351 Py_XDECREF(errorHandler);
7352 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007353 Py_XDECREF(v);
7354 return NULL;
7355}
7356
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007357/* Charmap encoding: the lookup table */
7358
Alexander Belopolsky40018472011-02-26 01:02:56 +00007359struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007360 PyObject_HEAD
7361 unsigned char level1[32];
7362 int count2, count3;
7363 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007364};
7365
7366static PyObject*
7367encoding_map_size(PyObject *obj, PyObject* args)
7368{
7369 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007370 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007371 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007372}
7373
7374static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007375 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007376 PyDoc_STR("Return the size (in bytes) of this object") },
7377 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007378};
7379
7380static void
7381encoding_map_dealloc(PyObject* o)
7382{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007383 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007384}
7385
7386static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007387 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007388 "EncodingMap", /*tp_name*/
7389 sizeof(struct encoding_map), /*tp_basicsize*/
7390 0, /*tp_itemsize*/
7391 /* methods */
7392 encoding_map_dealloc, /*tp_dealloc*/
7393 0, /*tp_print*/
7394 0, /*tp_getattr*/
7395 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007396 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007397 0, /*tp_repr*/
7398 0, /*tp_as_number*/
7399 0, /*tp_as_sequence*/
7400 0, /*tp_as_mapping*/
7401 0, /*tp_hash*/
7402 0, /*tp_call*/
7403 0, /*tp_str*/
7404 0, /*tp_getattro*/
7405 0, /*tp_setattro*/
7406 0, /*tp_as_buffer*/
7407 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7408 0, /*tp_doc*/
7409 0, /*tp_traverse*/
7410 0, /*tp_clear*/
7411 0, /*tp_richcompare*/
7412 0, /*tp_weaklistoffset*/
7413 0, /*tp_iter*/
7414 0, /*tp_iternext*/
7415 encoding_map_methods, /*tp_methods*/
7416 0, /*tp_members*/
7417 0, /*tp_getset*/
7418 0, /*tp_base*/
7419 0, /*tp_dict*/
7420 0, /*tp_descr_get*/
7421 0, /*tp_descr_set*/
7422 0, /*tp_dictoffset*/
7423 0, /*tp_init*/
7424 0, /*tp_alloc*/
7425 0, /*tp_new*/
7426 0, /*tp_free*/
7427 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007428};
7429
7430PyObject*
7431PyUnicode_BuildEncodingMap(PyObject* string)
7432{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007433 PyObject *result;
7434 struct encoding_map *mresult;
7435 int i;
7436 int need_dict = 0;
7437 unsigned char level1[32];
7438 unsigned char level2[512];
7439 unsigned char *mlevel1, *mlevel2, *mlevel3;
7440 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007441 int kind;
7442 void *data;
7443 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007444
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007445 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007446 PyErr_BadArgument();
7447 return NULL;
7448 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007449 kind = PyUnicode_KIND(string);
7450 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007451 memset(level1, 0xFF, sizeof level1);
7452 memset(level2, 0xFF, sizeof level2);
7453
7454 /* If there isn't a one-to-one mapping of NULL to \0,
7455 or if there are non-BMP characters, we need to use
7456 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007457 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007458 need_dict = 1;
7459 for (i = 1; i < 256; i++) {
7460 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007461 ch = PyUnicode_READ(kind, data, i);
7462 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007463 need_dict = 1;
7464 break;
7465 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007466 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007467 /* unmapped character */
7468 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007469 l1 = ch >> 11;
7470 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007471 if (level1[l1] == 0xFF)
7472 level1[l1] = count2++;
7473 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007474 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007475 }
7476
7477 if (count2 >= 0xFF || count3 >= 0xFF)
7478 need_dict = 1;
7479
7480 if (need_dict) {
7481 PyObject *result = PyDict_New();
7482 PyObject *key, *value;
7483 if (!result)
7484 return NULL;
7485 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007486 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007487 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007488 if (!key || !value)
7489 goto failed1;
7490 if (PyDict_SetItem(result, key, value) == -1)
7491 goto failed1;
7492 Py_DECREF(key);
7493 Py_DECREF(value);
7494 }
7495 return result;
7496 failed1:
7497 Py_XDECREF(key);
7498 Py_XDECREF(value);
7499 Py_DECREF(result);
7500 return NULL;
7501 }
7502
7503 /* Create a three-level trie */
7504 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7505 16*count2 + 128*count3 - 1);
7506 if (!result)
7507 return PyErr_NoMemory();
7508 PyObject_Init(result, &EncodingMapType);
7509 mresult = (struct encoding_map*)result;
7510 mresult->count2 = count2;
7511 mresult->count3 = count3;
7512 mlevel1 = mresult->level1;
7513 mlevel2 = mresult->level23;
7514 mlevel3 = mresult->level23 + 16*count2;
7515 memcpy(mlevel1, level1, 32);
7516 memset(mlevel2, 0xFF, 16*count2);
7517 memset(mlevel3, 0, 128*count3);
7518 count3 = 0;
7519 for (i = 1; i < 256; i++) {
7520 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007521 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007522 /* unmapped character */
7523 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007524 o1 = PyUnicode_READ(kind, data, i)>>11;
7525 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007526 i2 = 16*mlevel1[o1] + o2;
7527 if (mlevel2[i2] == 0xFF)
7528 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007529 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007530 i3 = 128*mlevel2[i2] + o3;
7531 mlevel3[i3] = i;
7532 }
7533 return result;
7534}
7535
7536static int
7537encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
7538{
7539 struct encoding_map *map = (struct encoding_map*)mapping;
7540 int l1 = c>>11;
7541 int l2 = (c>>7) & 0xF;
7542 int l3 = c & 0x7F;
7543 int i;
7544
7545#ifdef Py_UNICODE_WIDE
7546 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007547 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007548 }
7549#endif
7550 if (c == 0)
7551 return 0;
7552 /* level 1*/
7553 i = map->level1[l1];
7554 if (i == 0xFF) {
7555 return -1;
7556 }
7557 /* level 2*/
7558 i = map->level23[16*i+l2];
7559 if (i == 0xFF) {
7560 return -1;
7561 }
7562 /* level 3 */
7563 i = map->level23[16*map->count2 + 128*i + l3];
7564 if (i == 0) {
7565 return -1;
7566 }
7567 return i;
7568}
7569
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007570/* Lookup the character ch in the mapping. If the character
7571 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007572 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007573static PyObject *
7574charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007575{
Christian Heimes217cfd12007-12-02 14:31:20 +00007576 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007577 PyObject *x;
7578
7579 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007580 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007581 x = PyObject_GetItem(mapping, w);
7582 Py_DECREF(w);
7583 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007584 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7585 /* No mapping found means: mapping is undefined. */
7586 PyErr_Clear();
7587 x = Py_None;
7588 Py_INCREF(x);
7589 return x;
7590 } else
7591 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007592 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007593 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007594 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007595 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007596 long value = PyLong_AS_LONG(x);
7597 if (value < 0 || value > 255) {
7598 PyErr_SetString(PyExc_TypeError,
7599 "character mapping must be in range(256)");
7600 Py_DECREF(x);
7601 return NULL;
7602 }
7603 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007604 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007605 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007606 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007607 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007608 /* wrong return value */
7609 PyErr_Format(PyExc_TypeError,
7610 "character mapping must return integer, bytes or None, not %.400s",
7611 x->ob_type->tp_name);
7612 Py_DECREF(x);
7613 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007614 }
7615}
7616
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007617static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007618charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007619{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007620 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7621 /* exponentially overallocate to minimize reallocations */
7622 if (requiredsize < 2*outsize)
7623 requiredsize = 2*outsize;
7624 if (_PyBytes_Resize(outobj, requiredsize))
7625 return -1;
7626 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007627}
7628
/* Outcome of trying to encode one character through a charmap:
   enc_SUCCESS   - the character was encoded and written
   enc_FAILED    - the mapping is undefined for the character
   enc_EXCEPTION - an error occurred and an exception is pending */
typedef enum charmapencode_result {
    enc_SUCCESS,
    enc_FAILED,
    enc_EXCEPTION
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007632/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007633 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007634 space is available. Return a new reference to the object that
7635 was put in the output buffer, or Py_None, if the mapping was undefined
7636 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007637 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007638static charmapencode_result
7639charmapencode_output(Py_UNICODE c, PyObject *mapping,
7640 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007641{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007642 PyObject *rep;
7643 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007644 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007645
Christian Heimes90aa7642007-12-19 02:45:37 +00007646 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007647 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007648 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007649 if (res == -1)
7650 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007651 if (outsize<requiredsize)
7652 if (charmapencode_resize(outobj, outpos, requiredsize))
7653 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007654 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007655 outstart[(*outpos)++] = (char)res;
7656 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007657 }
7658
7659 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007660 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007661 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007662 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007663 Py_DECREF(rep);
7664 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007665 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007666 if (PyLong_Check(rep)) {
7667 Py_ssize_t requiredsize = *outpos+1;
7668 if (outsize<requiredsize)
7669 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7670 Py_DECREF(rep);
7671 return enc_EXCEPTION;
7672 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007673 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007674 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007675 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007676 else {
7677 const char *repchars = PyBytes_AS_STRING(rep);
7678 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7679 Py_ssize_t requiredsize = *outpos+repsize;
7680 if (outsize<requiredsize)
7681 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7682 Py_DECREF(rep);
7683 return enc_EXCEPTION;
7684 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007685 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007686 memcpy(outstart + *outpos, repchars, repsize);
7687 *outpos += repsize;
7688 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007689 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007690 Py_DECREF(rep);
7691 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007692}
7693
7694/* handle an error in PyUnicode_EncodeCharmap
7695 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007696static int
7697charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00007698 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007699 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007700 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007701 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007702{
7703 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007704 Py_ssize_t repsize;
7705 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007706 Py_UNICODE *uni2;
7707 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007708 Py_ssize_t collstartpos = *inpos;
7709 Py_ssize_t collendpos = *inpos+1;
7710 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007711 char *encoding = "charmap";
7712 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007713 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007714
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007715 /* find all unencodable characters */
7716 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007717 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007718 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007719 int res = encoding_map_lookup(p[collendpos], mapping);
7720 if (res != -1)
7721 break;
7722 ++collendpos;
7723 continue;
7724 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007725
Benjamin Peterson29060642009-01-31 22:14:21 +00007726 rep = charmapencode_lookup(p[collendpos], mapping);
7727 if (rep==NULL)
7728 return -1;
7729 else if (rep!=Py_None) {
7730 Py_DECREF(rep);
7731 break;
7732 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007733 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007734 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007735 }
7736 /* cache callback name lookup
7737 * (if not done yet, i.e. it's the first error) */
7738 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007739 if ((errors==NULL) || (!strcmp(errors, "strict")))
7740 *known_errorHandler = 1;
7741 else if (!strcmp(errors, "replace"))
7742 *known_errorHandler = 2;
7743 else if (!strcmp(errors, "ignore"))
7744 *known_errorHandler = 3;
7745 else if (!strcmp(errors, "xmlcharrefreplace"))
7746 *known_errorHandler = 4;
7747 else
7748 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007749 }
7750 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007751 case 1: /* strict */
7752 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7753 return -1;
7754 case 2: /* replace */
7755 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007756 x = charmapencode_output('?', mapping, res, respos);
7757 if (x==enc_EXCEPTION) {
7758 return -1;
7759 }
7760 else if (x==enc_FAILED) {
7761 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7762 return -1;
7763 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007764 }
7765 /* fall through */
7766 case 3: /* ignore */
7767 *inpos = collendpos;
7768 break;
7769 case 4: /* xmlcharrefreplace */
7770 /* generate replacement (temporarily (mis)uses p) */
7771 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007772 char buffer[2+29+1+1];
7773 char *cp;
7774 sprintf(buffer, "&#%d;", (int)p[collpos]);
7775 for (cp = buffer; *cp; ++cp) {
7776 x = charmapencode_output(*cp, mapping, res, respos);
7777 if (x==enc_EXCEPTION)
7778 return -1;
7779 else if (x==enc_FAILED) {
7780 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7781 return -1;
7782 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007783 }
7784 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007785 *inpos = collendpos;
7786 break;
7787 default:
7788 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00007789 encoding, reason, p, size, exceptionObject,
7790 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007791 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007792 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007793 if (PyBytes_Check(repunicode)) {
7794 /* Directly copy bytes result to output. */
7795 Py_ssize_t outsize = PyBytes_Size(*res);
7796 Py_ssize_t requiredsize;
7797 repsize = PyBytes_Size(repunicode);
7798 requiredsize = *respos + repsize;
7799 if (requiredsize > outsize)
7800 /* Make room for all additional bytes. */
7801 if (charmapencode_resize(res, respos, requiredsize)) {
7802 Py_DECREF(repunicode);
7803 return -1;
7804 }
7805 memcpy(PyBytes_AsString(*res) + *respos,
7806 PyBytes_AsString(repunicode), repsize);
7807 *respos += repsize;
7808 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007809 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007810 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007811 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007812 /* generate replacement */
7813 repsize = PyUnicode_GET_SIZE(repunicode);
7814 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007815 x = charmapencode_output(*uni2, mapping, res, respos);
7816 if (x==enc_EXCEPTION) {
7817 return -1;
7818 }
7819 else if (x==enc_FAILED) {
7820 Py_DECREF(repunicode);
7821 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7822 return -1;
7823 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007824 }
7825 *inpos = newpos;
7826 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007827 }
7828 return 0;
7829}
7830
Alexander Belopolsky40018472011-02-26 01:02:56 +00007831PyObject *
7832PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7833 Py_ssize_t size,
7834 PyObject *mapping,
7835 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007836{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007837 /* output object */
7838 PyObject *res = NULL;
7839 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007840 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007841 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007842 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007843 PyObject *errorHandler = NULL;
7844 PyObject *exc = NULL;
7845 /* the following variable is used for caching string comparisons
7846 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7847 * 3=ignore, 4=xmlcharrefreplace */
7848 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007849
7850 /* Default to Latin-1 */
7851 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007852 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007853
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007854 /* allocate enough for a simple encoding without
7855 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007856 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007857 if (res == NULL)
7858 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007859 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007860 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007861
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007862 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007863 /* try to encode it */
7864 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7865 if (x==enc_EXCEPTION) /* error */
7866 goto onError;
7867 if (x==enc_FAILED) { /* unencodable character */
7868 if (charmap_encoding_error(p, size, &inpos, mapping,
7869 &exc,
7870 &known_errorHandler, &errorHandler, errors,
7871 &res, &respos)) {
7872 goto onError;
7873 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007874 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007875 else
7876 /* done with this character => adjust input position */
7877 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007878 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007879
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007880 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007881 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007882 if (_PyBytes_Resize(&res, respos) < 0)
7883 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007884
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007885 Py_XDECREF(exc);
7886 Py_XDECREF(errorHandler);
7887 return res;
7888
Benjamin Peterson29060642009-01-31 22:14:21 +00007889 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007890 Py_XDECREF(res);
7891 Py_XDECREF(exc);
7892 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007893 return NULL;
7894}
7895
Alexander Belopolsky40018472011-02-26 01:02:56 +00007896PyObject *
7897PyUnicode_AsCharmapString(PyObject *unicode,
7898 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007899{
7900 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007901 PyErr_BadArgument();
7902 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007903 }
7904 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007905 PyUnicode_GET_SIZE(unicode),
7906 mapping,
7907 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007908}
7909
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007910/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007911static void
7912make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007913 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007914 Py_ssize_t startpos, Py_ssize_t endpos,
7915 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007916{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007917 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007918 *exceptionObject = _PyUnicodeTranslateError_Create(
7919 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007920 }
7921 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007922 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7923 goto onError;
7924 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7925 goto onError;
7926 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7927 goto onError;
7928 return;
7929 onError:
7930 Py_DECREF(*exceptionObject);
7931 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007932 }
7933}
7934
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007935/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007936static void
7937raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007938 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007939 Py_ssize_t startpos, Py_ssize_t endpos,
7940 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007941{
7942 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007943 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007944 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007945 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007946}
7947
7948/* error handling callback helper:
7949 build arguments, call the callback and check the arguments,
7950 put the result into newpos and return the replacement string, which
7951 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007952static PyObject *
7953unicode_translate_call_errorhandler(const char *errors,
7954 PyObject **errorHandler,
7955 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007956 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007957 Py_ssize_t startpos, Py_ssize_t endpos,
7958 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007959{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007960 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007961
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007962 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007963 PyObject *restuple;
7964 PyObject *resunicode;
7965
7966 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007967 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007968 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007969 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007970 }
7971
7972 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007973 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007974 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007975 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007976
7977 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007978 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007979 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007980 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007981 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007982 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007983 Py_DECREF(restuple);
7984 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007985 }
7986 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007987 &resunicode, &i_newpos)) {
7988 Py_DECREF(restuple);
7989 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007990 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007991 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007992 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007993 else
7994 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007995 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007996 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7997 Py_DECREF(restuple);
7998 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007999 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008000 Py_INCREF(resunicode);
8001 Py_DECREF(restuple);
8002 return resunicode;
8003}
8004
8005/* Lookup the character ch in the mapping and put the result in result,
8006 which must be decrefed by the caller.
8007 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008008static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008009charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008010{
Christian Heimes217cfd12007-12-02 14:31:20 +00008011 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008012 PyObject *x;
8013
8014 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008015 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008016 x = PyObject_GetItem(mapping, w);
8017 Py_DECREF(w);
8018 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008019 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8020 /* No mapping found means: use 1:1 mapping. */
8021 PyErr_Clear();
8022 *result = NULL;
8023 return 0;
8024 } else
8025 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008026 }
8027 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008028 *result = x;
8029 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008030 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008031 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008032 long value = PyLong_AS_LONG(x);
8033 long max = PyUnicode_GetMax();
8034 if (value < 0 || value > max) {
8035 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008036 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008037 Py_DECREF(x);
8038 return -1;
8039 }
8040 *result = x;
8041 return 0;
8042 }
8043 else if (PyUnicode_Check(x)) {
8044 *result = x;
8045 return 0;
8046 }
8047 else {
8048 /* wrong return value */
8049 PyErr_SetString(PyExc_TypeError,
8050 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008051 Py_DECREF(x);
8052 return -1;
8053 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008054}
8055/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008056 if not reallocate and adjust various state variables.
8057 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008058static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008059charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008060 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008061{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008062 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00008063 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008064 /* exponentially overallocate to minimize reallocations */
8065 if (requiredsize < 2 * oldsize)
8066 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008067 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8068 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008069 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008070 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008071 }
8072 return 0;
8073}
8074/* lookup the character, put the result in the output string and adjust
8075 various state variables. Return a new reference to the object that
8076 was put in the output buffer in *result, or Py_None, if the mapping was
8077 undefined (in which case no character was written).
8078 The called must decref result.
8079 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008080static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008081charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8082 PyObject *mapping, Py_UCS4 **output,
8083 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008084 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008085{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008086 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8087 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008088 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008089 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008090 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008091 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008092 }
8093 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008094 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008095 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008096 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008097 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008098 }
8099 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008100 Py_ssize_t repsize;
8101 if (PyUnicode_READY(*res) == -1)
8102 return -1;
8103 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008104 if (repsize==1) {
8105 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008106 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008107 }
8108 else if (repsize!=0) {
8109 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008110 Py_ssize_t requiredsize = *opos +
8111 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008112 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008113 Py_ssize_t i;
8114 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008115 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008116 for(i = 0; i < repsize; i++)
8117 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008118 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008119 }
8120 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008121 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008122 return 0;
8123}
8124
Alexander Belopolsky40018472011-02-26 01:02:56 +00008125PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008126_PyUnicode_TranslateCharmap(PyObject *input,
8127 PyObject *mapping,
8128 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008129{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008130 /* input object */
8131 char *idata;
8132 Py_ssize_t size, i;
8133 int kind;
8134 /* output buffer */
8135 Py_UCS4 *output = NULL;
8136 Py_ssize_t osize;
8137 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008138 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008139 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008140 char *reason = "character maps to <undefined>";
8141 PyObject *errorHandler = NULL;
8142 PyObject *exc = NULL;
8143 /* the following variable is used for caching string comparisons
8144 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8145 * 3=ignore, 4=xmlcharrefreplace */
8146 int known_errorHandler = -1;
8147
Guido van Rossumd57fd912000-03-10 22:53:23 +00008148 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008149 PyErr_BadArgument();
8150 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008151 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008152
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008153 if (PyUnicode_READY(input) == -1)
8154 return NULL;
8155 idata = (char*)PyUnicode_DATA(input);
8156 kind = PyUnicode_KIND(input);
8157 size = PyUnicode_GET_LENGTH(input);
8158 i = 0;
8159
8160 if (size == 0) {
8161 Py_INCREF(input);
8162 return input;
8163 }
8164
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008165 /* allocate enough for a simple 1:1 translation without
8166 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008167 osize = size;
8168 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8169 opos = 0;
8170 if (output == NULL) {
8171 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008172 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008173 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008174
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008175 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008176 /* try to encode it */
8177 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008178 if (charmaptranslate_output(input, i, mapping,
8179 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008180 Py_XDECREF(x);
8181 goto onError;
8182 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008183 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008184 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008185 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008186 else { /* untranslatable character */
8187 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8188 Py_ssize_t repsize;
8189 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008190 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008191 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008192 Py_ssize_t collstart = i;
8193 Py_ssize_t collend = i+1;
8194 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008195
Benjamin Peterson29060642009-01-31 22:14:21 +00008196 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008197 while (collend < size) {
8198 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008199 goto onError;
8200 Py_XDECREF(x);
8201 if (x!=Py_None)
8202 break;
8203 ++collend;
8204 }
8205 /* cache callback name lookup
8206 * (if not done yet, i.e. it's the first error) */
8207 if (known_errorHandler==-1) {
8208 if ((errors==NULL) || (!strcmp(errors, "strict")))
8209 known_errorHandler = 1;
8210 else if (!strcmp(errors, "replace"))
8211 known_errorHandler = 2;
8212 else if (!strcmp(errors, "ignore"))
8213 known_errorHandler = 3;
8214 else if (!strcmp(errors, "xmlcharrefreplace"))
8215 known_errorHandler = 4;
8216 else
8217 known_errorHandler = 0;
8218 }
8219 switch (known_errorHandler) {
8220 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008221 raise_translate_exception(&exc, input, collstart,
8222 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008223 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008224 case 2: /* replace */
8225 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008226 for (coll = collstart; coll<collend; coll++)
8227 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008228 /* fall through */
8229 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008230 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008231 break;
8232 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008233 /* generate replacement (temporarily (mis)uses i) */
8234 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008235 char buffer[2+29+1+1];
8236 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008237 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8238 if (charmaptranslate_makespace(&output, &osize,
8239 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008240 goto onError;
8241 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008242 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008243 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008244 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008245 break;
8246 default:
8247 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008248 reason, input, &exc,
8249 collstart, collend, &newpos);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02008250 if (repunicode == NULL || _PyUnicode_READY_REPLACE(&repunicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00008251 goto onError;
8252 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008253 repsize = PyUnicode_GET_LENGTH(repunicode);
8254 if (charmaptranslate_makespace(&output, &osize,
8255 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008256 Py_DECREF(repunicode);
8257 goto onError;
8258 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008259 for (uni2 = 0; repsize-->0; ++uni2)
8260 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8261 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008262 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008263 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008264 }
8265 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008266 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8267 if (!res)
8268 goto onError;
8269 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008270 Py_XDECREF(exc);
8271 Py_XDECREF(errorHandler);
8272 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008273
Benjamin Peterson29060642009-01-31 22:14:21 +00008274 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008275 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008276 Py_XDECREF(exc);
8277 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008278 return NULL;
8279}
8280
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008281/* Deprecated. Use PyUnicode_Translate instead. */
8282PyObject *
8283PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8284 Py_ssize_t size,
8285 PyObject *mapping,
8286 const char *errors)
8287{
8288 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8289 if (!unicode)
8290 return NULL;
8291 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8292}
8293
Alexander Belopolsky40018472011-02-26 01:02:56 +00008294PyObject *
8295PyUnicode_Translate(PyObject *str,
8296 PyObject *mapping,
8297 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008298{
8299 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008300
Guido van Rossumd57fd912000-03-10 22:53:23 +00008301 str = PyUnicode_FromObject(str);
8302 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008303 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008304 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008305 Py_DECREF(str);
8306 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008307
Benjamin Peterson29060642009-01-31 22:14:21 +00008308 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008309 Py_XDECREF(str);
8310 return NULL;
8311}
Tim Petersced69f82003-09-16 20:30:58 +00008312
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008313static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008314fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008315{
8316 /* No need to call PyUnicode_READY(self) because this function is only
8317 called as a callback from fixup() which does it already. */
8318 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8319 const int kind = PyUnicode_KIND(self);
8320 void *data = PyUnicode_DATA(self);
8321 Py_UCS4 maxchar = 0, ch, fixed;
8322 Py_ssize_t i;
8323
8324 for (i = 0; i < len; ++i) {
8325 ch = PyUnicode_READ(kind, data, i);
8326 fixed = 0;
8327 if (ch > 127) {
8328 if (Py_UNICODE_ISSPACE(ch))
8329 fixed = ' ';
8330 else {
8331 const int decimal = Py_UNICODE_TODECIMAL(ch);
8332 if (decimal >= 0)
8333 fixed = '0' + decimal;
8334 }
8335 if (fixed != 0) {
8336 if (fixed > maxchar)
8337 maxchar = fixed;
8338 PyUnicode_WRITE(kind, data, i, fixed);
8339 }
8340 else if (ch > maxchar)
8341 maxchar = ch;
8342 }
8343 else if (ch > maxchar)
8344 maxchar = ch;
8345 }
8346
8347 return maxchar;
8348}
8349
8350PyObject *
8351_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8352{
8353 if (!PyUnicode_Check(unicode)) {
8354 PyErr_BadInternalCall();
8355 return NULL;
8356 }
8357 if (PyUnicode_READY(unicode) == -1)
8358 return NULL;
8359 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8360 /* If the string is already ASCII, just return the same string */
8361 Py_INCREF(unicode);
8362 return unicode;
8363 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008364 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008365}
8366
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008367PyObject *
8368PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8369 Py_ssize_t length)
8370{
8371 PyObject *result;
8372 Py_UNICODE *p; /* write pointer into result */
8373 Py_ssize_t i;
8374 /* Copy to a new string */
8375 result = (PyObject *)_PyUnicode_New(length);
8376 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
8377 if (result == NULL)
8378 return result;
8379 p = PyUnicode_AS_UNICODE(result);
8380 /* Iterate over code points */
8381 for (i = 0; i < length; i++) {
8382 Py_UNICODE ch =s[i];
8383 if (ch > 127) {
8384 int decimal = Py_UNICODE_TODECIMAL(ch);
8385 if (decimal >= 0)
8386 p[i] = '0' + decimal;
8387 }
8388 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008389#ifndef DONT_MAKE_RESULT_READY
8390 if (_PyUnicode_READY_REPLACE(&result)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008391 Py_DECREF(result);
8392 return NULL;
8393 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008394#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02008395 assert(_PyUnicode_CheckConsistency(result, 1));
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008396 return result;
8397}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008398/* --- Decimal Encoder ---------------------------------------------------- */
8399
Alexander Belopolsky40018472011-02-26 01:02:56 +00008400int
8401PyUnicode_EncodeDecimal(Py_UNICODE *s,
8402 Py_ssize_t length,
8403 char *output,
8404 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008405{
8406 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008407 PyObject *errorHandler = NULL;
8408 PyObject *exc = NULL;
8409 const char *encoding = "decimal";
8410 const char *reason = "invalid decimal Unicode string";
8411 /* the following variable is used for caching string comparisons
8412 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
8413 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008414
8415 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008416 PyErr_BadArgument();
8417 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008418 }
8419
8420 p = s;
8421 end = s + length;
8422 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008423 register Py_UNICODE ch = *p;
8424 int decimal;
8425 PyObject *repunicode;
8426 Py_ssize_t repsize;
8427 Py_ssize_t newpos;
8428 Py_UNICODE *uni2;
8429 Py_UNICODE *collstart;
8430 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00008431
Benjamin Peterson29060642009-01-31 22:14:21 +00008432 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008433 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00008434 ++p;
8435 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008436 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008437 decimal = Py_UNICODE_TODECIMAL(ch);
8438 if (decimal >= 0) {
8439 *output++ = '0' + decimal;
8440 ++p;
8441 continue;
8442 }
8443 if (0 < ch && ch < 256) {
8444 *output++ = (char)ch;
8445 ++p;
8446 continue;
8447 }
8448 /* All other characters are considered unencodable */
8449 collstart = p;
8450 collend = p+1;
8451 while (collend < end) {
8452 if ((0 < *collend && *collend < 256) ||
8453 !Py_UNICODE_ISSPACE(*collend) ||
8454 Py_UNICODE_TODECIMAL(*collend))
8455 break;
8456 }
8457 /* cache callback name lookup
8458 * (if not done yet, i.e. it's the first error) */
8459 if (known_errorHandler==-1) {
8460 if ((errors==NULL) || (!strcmp(errors, "strict")))
8461 known_errorHandler = 1;
8462 else if (!strcmp(errors, "replace"))
8463 known_errorHandler = 2;
8464 else if (!strcmp(errors, "ignore"))
8465 known_errorHandler = 3;
8466 else if (!strcmp(errors, "xmlcharrefreplace"))
8467 known_errorHandler = 4;
8468 else
8469 known_errorHandler = 0;
8470 }
8471 switch (known_errorHandler) {
8472 case 1: /* strict */
8473 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
8474 goto onError;
8475 case 2: /* replace */
8476 for (p = collstart; p < collend; ++p)
8477 *output++ = '?';
8478 /* fall through */
8479 case 3: /* ignore */
8480 p = collend;
8481 break;
8482 case 4: /* xmlcharrefreplace */
8483 /* generate replacement (temporarily (mis)uses p) */
8484 for (p = collstart; p < collend; ++p)
8485 output += sprintf(output, "&#%d;", (int)*p);
8486 p = collend;
8487 break;
8488 default:
8489 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
8490 encoding, reason, s, length, &exc,
8491 collstart-s, collend-s, &newpos);
8492 if (repunicode == NULL)
8493 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008494 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00008495 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008496 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8497 Py_DECREF(repunicode);
8498 goto onError;
8499 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008500 /* generate replacement */
8501 repsize = PyUnicode_GET_SIZE(repunicode);
8502 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8503 Py_UNICODE ch = *uni2;
8504 if (Py_UNICODE_ISSPACE(ch))
8505 *output++ = ' ';
8506 else {
8507 decimal = Py_UNICODE_TODECIMAL(ch);
8508 if (decimal >= 0)
8509 *output++ = '0' + decimal;
8510 else if (0 < ch && ch < 256)
8511 *output++ = (char)ch;
8512 else {
8513 Py_DECREF(repunicode);
8514 raise_encode_exception(&exc, encoding,
8515 s, length, collstart-s, collend-s, reason);
8516 goto onError;
8517 }
8518 }
8519 }
8520 p = s + newpos;
8521 Py_DECREF(repunicode);
8522 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00008523 }
8524 /* 0-terminate the output string */
8525 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008526 Py_XDECREF(exc);
8527 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008528 return 0;
8529
Benjamin Peterson29060642009-01-31 22:14:21 +00008530 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008531 Py_XDECREF(exc);
8532 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008533 return -1;
8534}
8535
Guido van Rossumd57fd912000-03-10 22:53:23 +00008536/* --- Helpers ------------------------------------------------------------ */
8537
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008538static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008539any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008540 Py_ssize_t start,
8541 Py_ssize_t end)
8542{
8543 int kind1, kind2, kind;
8544 void *buf1, *buf2;
8545 Py_ssize_t len1, len2, result;
8546
8547 kind1 = PyUnicode_KIND(s1);
8548 kind2 = PyUnicode_KIND(s2);
8549 kind = kind1 > kind2 ? kind1 : kind2;
8550 buf1 = PyUnicode_DATA(s1);
8551 buf2 = PyUnicode_DATA(s2);
8552 if (kind1 != kind)
8553 buf1 = _PyUnicode_AsKind(s1, kind);
8554 if (!buf1)
8555 return -2;
8556 if (kind2 != kind)
8557 buf2 = _PyUnicode_AsKind(s2, kind);
8558 if (!buf2) {
8559 if (kind1 != kind) PyMem_Free(buf1);
8560 return -2;
8561 }
8562 len1 = PyUnicode_GET_LENGTH(s1);
8563 len2 = PyUnicode_GET_LENGTH(s2);
8564
Victor Stinner794d5672011-10-10 03:21:36 +02008565 if (direction > 0) {
8566 switch(kind) {
8567 case PyUnicode_1BYTE_KIND:
8568 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8569 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8570 else
8571 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8572 break;
8573 case PyUnicode_2BYTE_KIND:
8574 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8575 break;
8576 case PyUnicode_4BYTE_KIND:
8577 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8578 break;
8579 default:
8580 assert(0); result = -2;
8581 }
8582 }
8583 else {
8584 switch(kind) {
8585 case PyUnicode_1BYTE_KIND:
8586 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8587 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8588 else
8589 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8590 break;
8591 case PyUnicode_2BYTE_KIND:
8592 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8593 break;
8594 case PyUnicode_4BYTE_KIND:
8595 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8596 break;
8597 default:
8598 assert(0); result = -2;
8599 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008600 }
8601
8602 if (kind1 != kind)
8603 PyMem_Free(buf1);
8604 if (kind2 != kind)
8605 PyMem_Free(buf2);
8606
8607 return result;
8608}
8609
8610Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02008611_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008612 Py_ssize_t n_buffer,
8613 void *digits, Py_ssize_t n_digits,
8614 Py_ssize_t min_width,
8615 const char *grouping,
8616 const char *thousands_sep)
8617{
8618 switch(kind) {
8619 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008620 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
8621 return _PyUnicode_ascii_InsertThousandsGrouping(
8622 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8623 min_width, grouping, thousands_sep);
8624 else
8625 return _PyUnicode_ucs1_InsertThousandsGrouping(
8626 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8627 min_width, grouping, thousands_sep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008628 case PyUnicode_2BYTE_KIND:
8629 return _PyUnicode_ucs2_InsertThousandsGrouping(
8630 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8631 min_width, grouping, thousands_sep);
8632 case PyUnicode_4BYTE_KIND:
8633 return _PyUnicode_ucs4_InsertThousandsGrouping(
8634 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8635 min_width, grouping, thousands_sep);
8636 }
8637 assert(0);
8638 return -1;
8639}
8640
8641
/* helper macro to fixup start/end slice values in place:
   an `end` beyond `len` is clipped to `len`; a negative `end` or `start`
   is taken relative to `len` and clipped to 0 if still negative.
   `start` and `end` must be modifiable lvalues; all three arguments are
   evaluated more than once.
   The body is wrapped in do { } while (0) so that the macro behaves like
   a single statement (safe inside an unbraced if/else); use it with a
   trailing semicolon.
   NOTE(review): the #undef guards against an earlier definition of the
   same name with a different body -- confirm nothing included after this
   point defines it again. */
#undef ADJUST_INDICES
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008656
Alexander Belopolsky40018472011-02-26 01:02:56 +00008657Py_ssize_t
8658PyUnicode_Count(PyObject *str,
8659 PyObject *substr,
8660 Py_ssize_t start,
8661 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008662{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008663 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008664 PyUnicodeObject* str_obj;
8665 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008666 int kind1, kind2, kind;
8667 void *buf1 = NULL, *buf2 = NULL;
8668 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008669
Thomas Wouters477c8d52006-05-27 19:21:47 +00008670 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008671 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008672 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008673 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02008674 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008675 Py_DECREF(str_obj);
8676 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008677 }
Tim Petersced69f82003-09-16 20:30:58 +00008678
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008679 kind1 = PyUnicode_KIND(str_obj);
8680 kind2 = PyUnicode_KIND(sub_obj);
8681 kind = kind1 > kind2 ? kind1 : kind2;
8682 buf1 = PyUnicode_DATA(str_obj);
8683 if (kind1 != kind)
8684 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
8685 if (!buf1)
8686 goto onError;
8687 buf2 = PyUnicode_DATA(sub_obj);
8688 if (kind2 != kind)
8689 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
8690 if (!buf2)
8691 goto onError;
8692 len1 = PyUnicode_GET_LENGTH(str_obj);
8693 len2 = PyUnicode_GET_LENGTH(sub_obj);
8694
8695 ADJUST_INDICES(start, end, len1);
8696 switch(kind) {
8697 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008698 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
8699 result = asciilib_count(
8700 ((Py_UCS1*)buf1) + start, end - start,
8701 buf2, len2, PY_SSIZE_T_MAX
8702 );
8703 else
8704 result = ucs1lib_count(
8705 ((Py_UCS1*)buf1) + start, end - start,
8706 buf2, len2, PY_SSIZE_T_MAX
8707 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008708 break;
8709 case PyUnicode_2BYTE_KIND:
8710 result = ucs2lib_count(
8711 ((Py_UCS2*)buf1) + start, end - start,
8712 buf2, len2, PY_SSIZE_T_MAX
8713 );
8714 break;
8715 case PyUnicode_4BYTE_KIND:
8716 result = ucs4lib_count(
8717 ((Py_UCS4*)buf1) + start, end - start,
8718 buf2, len2, PY_SSIZE_T_MAX
8719 );
8720 break;
8721 default:
8722 assert(0); result = 0;
8723 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008724
8725 Py_DECREF(sub_obj);
8726 Py_DECREF(str_obj);
8727
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008728 if (kind1 != kind)
8729 PyMem_Free(buf1);
8730 if (kind2 != kind)
8731 PyMem_Free(buf2);
8732
Guido van Rossumd57fd912000-03-10 22:53:23 +00008733 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008734 onError:
8735 Py_DECREF(sub_obj);
8736 Py_DECREF(str_obj);
8737 if (kind1 != kind && buf1)
8738 PyMem_Free(buf1);
8739 if (kind2 != kind && buf2)
8740 PyMem_Free(buf2);
8741 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008742}
8743
Alexander Belopolsky40018472011-02-26 01:02:56 +00008744Py_ssize_t
8745PyUnicode_Find(PyObject *str,
8746 PyObject *sub,
8747 Py_ssize_t start,
8748 Py_ssize_t end,
8749 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008750{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008751 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008752
Guido van Rossumd57fd912000-03-10 22:53:23 +00008753 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008754 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008755 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008756 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008757 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008758 Py_DECREF(str);
8759 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008760 }
Tim Petersced69f82003-09-16 20:30:58 +00008761
Victor Stinner794d5672011-10-10 03:21:36 +02008762 result = any_find_slice(direction,
8763 str, sub, start, end
8764 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00008765
Guido van Rossumd57fd912000-03-10 22:53:23 +00008766 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008767 Py_DECREF(sub);
8768
Guido van Rossumd57fd912000-03-10 22:53:23 +00008769 return result;
8770}
8771
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008772Py_ssize_t
8773PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8774 Py_ssize_t start, Py_ssize_t end,
8775 int direction)
8776{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008777 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02008778 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008779 if (PyUnicode_READY(str) == -1)
8780 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008781 if (start < 0 || end < 0) {
8782 PyErr_SetString(PyExc_IndexError, "string index out of range");
8783 return -2;
8784 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008785 if (end > PyUnicode_GET_LENGTH(str))
8786 end = PyUnicode_GET_LENGTH(str);
8787 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02008788 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
8789 kind, end-start, ch, direction);
8790 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008791 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02008792 else
8793 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008794}
8795
Alexander Belopolsky40018472011-02-26 01:02:56 +00008796static int
8797tailmatch(PyUnicodeObject *self,
8798 PyUnicodeObject *substring,
8799 Py_ssize_t start,
8800 Py_ssize_t end,
8801 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008802{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008803 int kind_self;
8804 int kind_sub;
8805 void *data_self;
8806 void *data_sub;
8807 Py_ssize_t offset;
8808 Py_ssize_t i;
8809 Py_ssize_t end_sub;
8810
8811 if (PyUnicode_READY(self) == -1 ||
8812 PyUnicode_READY(substring) == -1)
8813 return 0;
8814
8815 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008816 return 1;
8817
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008818 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8819 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008820 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008821 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008822
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008823 kind_self = PyUnicode_KIND(self);
8824 data_self = PyUnicode_DATA(self);
8825 kind_sub = PyUnicode_KIND(substring);
8826 data_sub = PyUnicode_DATA(substring);
8827 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8828
8829 if (direction > 0)
8830 offset = end;
8831 else
8832 offset = start;
8833
8834 if (PyUnicode_READ(kind_self, data_self, offset) ==
8835 PyUnicode_READ(kind_sub, data_sub, 0) &&
8836 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8837 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8838 /* If both are of the same kind, memcmp is sufficient */
8839 if (kind_self == kind_sub) {
8840 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02008841 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008842 data_sub,
8843 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02008844 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008845 }
8846 /* otherwise we have to compare each character by first accesing it */
8847 else {
8848 /* We do not need to compare 0 and len(substring)-1 because
8849 the if statement above ensured already that they are equal
8850 when we end up here. */
8851 // TODO: honor direction and do a forward or backwards search
8852 for (i = 1; i < end_sub; ++i) {
8853 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8854 PyUnicode_READ(kind_sub, data_sub, i))
8855 return 0;
8856 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008857 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008858 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008859 }
8860
8861 return 0;
8862}
8863
Alexander Belopolsky40018472011-02-26 01:02:56 +00008864Py_ssize_t
8865PyUnicode_Tailmatch(PyObject *str,
8866 PyObject *substr,
8867 Py_ssize_t start,
8868 Py_ssize_t end,
8869 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008870{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008871 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008872
Guido van Rossumd57fd912000-03-10 22:53:23 +00008873 str = PyUnicode_FromObject(str);
8874 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008875 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008876 substr = PyUnicode_FromObject(substr);
8877 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008878 Py_DECREF(str);
8879 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008880 }
Tim Petersced69f82003-09-16 20:30:58 +00008881
Guido van Rossumd57fd912000-03-10 22:53:23 +00008882 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008883 (PyUnicodeObject *)substr,
8884 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008885 Py_DECREF(str);
8886 Py_DECREF(substr);
8887 return result;
8888}
8889
Guido van Rossumd57fd912000-03-10 22:53:23 +00008890/* Apply fixfct filter to the Unicode object self and return a
8891 reference to the modified object */
8892
Alexander Belopolsky40018472011-02-26 01:02:56 +00008893static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02008894fixup(PyObject *self,
8895 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008896{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008897 PyObject *u;
8898 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008899
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008900 if (PyUnicode_READY(self) == -1)
8901 return NULL;
8902 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8903 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8904 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008905 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008906 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008907
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008908 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02008909 PyUnicode_GET_LENGTH(u) * PyUnicode_KIND(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008910
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008911 /* fix functions return the new maximum character in a string,
8912 if the kind of the resulting unicode object does not change,
8913 everything is fine. Otherwise we need to change the string kind
8914 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02008915 maxchar_new = fixfct(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008916 if (maxchar_new == 0)
8917 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8918 else if (maxchar_new <= 127)
8919 maxchar_new = 127;
8920 else if (maxchar_new <= 255)
8921 maxchar_new = 255;
8922 else if (maxchar_new <= 65535)
8923 maxchar_new = 65535;
8924 else
8925 maxchar_new = 1114111; /* 0x10ffff */
8926
8927 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008928 /* fixfct should return TRUE if it modified the buffer. If
8929 FALSE, return a reference to the original buffer instead
8930 (to save space, not time) */
8931 Py_INCREF(self);
8932 Py_DECREF(u);
8933 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008934 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008935 else if (maxchar_new == maxchar_old) {
8936 return u;
8937 }
8938 else {
8939 /* In case the maximum character changed, we need to
8940 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008941 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008942 if (v == NULL) {
8943 Py_DECREF(u);
8944 return NULL;
8945 }
8946 if (maxchar_new > maxchar_old) {
8947 /* If the maxchar increased so that the kind changed, not all
8948 characters are representable anymore and we need to fix the
8949 string again. This only happens in very few cases. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02008950 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner9310abb2011-10-05 00:59:23 +02008951 maxchar_old = fixfct(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008952 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8953 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008954 else {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02008955 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008956 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008957
8958 Py_DECREF(u);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02008959 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008960 return v;
8961 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008962}
8963
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008964static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008965fixupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008966{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008967 /* No need to call PyUnicode_READY(self) because this function is only
8968 called as a callback from fixup() which does it already. */
8969 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8970 const int kind = PyUnicode_KIND(self);
8971 void *data = PyUnicode_DATA(self);
8972 int touched = 0;
8973 Py_UCS4 maxchar = 0;
8974 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008975
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008976 for (i = 0; i < len; ++i) {
8977 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8978 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8979 if (up != ch) {
8980 if (up > maxchar)
8981 maxchar = up;
8982 PyUnicode_WRITE(kind, data, i, up);
8983 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008984 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008985 else if (ch > maxchar)
8986 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008987 }
8988
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008989 if (touched)
8990 return maxchar;
8991 else
8992 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008993}
8994
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008995static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008996fixlower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008997{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008998 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8999 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9000 const int kind = PyUnicode_KIND(self);
9001 void *data = PyUnicode_DATA(self);
9002 int touched = 0;
9003 Py_UCS4 maxchar = 0;
9004 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009005
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009006 for(i = 0; i < len; ++i) {
9007 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9008 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9009 if (lo != ch) {
9010 if (lo > maxchar)
9011 maxchar = lo;
9012 PyUnicode_WRITE(kind, data, i, lo);
9013 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009014 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009015 else if (ch > maxchar)
9016 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009017 }
9018
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009019 if (touched)
9020 return maxchar;
9021 else
9022 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009023}
9024
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009025static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009026fixswapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009027{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009028 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9029 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9030 const int kind = PyUnicode_KIND(self);
9031 void *data = PyUnicode_DATA(self);
9032 int touched = 0;
9033 Py_UCS4 maxchar = 0;
9034 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009035
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009036 for(i = 0; i < len; ++i) {
9037 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9038 Py_UCS4 nu = 0;
9039
9040 if (Py_UNICODE_ISUPPER(ch))
9041 nu = Py_UNICODE_TOLOWER(ch);
9042 else if (Py_UNICODE_ISLOWER(ch))
9043 nu = Py_UNICODE_TOUPPER(ch);
9044
9045 if (nu != 0) {
9046 if (nu > maxchar)
9047 maxchar = nu;
9048 PyUnicode_WRITE(kind, data, i, nu);
9049 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009050 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009051 else if (ch > maxchar)
9052 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009053 }
9054
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009055 if (touched)
9056 return maxchar;
9057 else
9058 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009059}
9060
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009061static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009062fixcapitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009063{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009064 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9065 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9066 const int kind = PyUnicode_KIND(self);
9067 void *data = PyUnicode_DATA(self);
9068 int touched = 0;
9069 Py_UCS4 maxchar = 0;
9070 Py_ssize_t i = 0;
9071 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00009072
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009073 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009074 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009075
9076 ch = PyUnicode_READ(kind, data, i);
9077 if (!Py_UNICODE_ISUPPER(ch)) {
9078 maxchar = Py_UNICODE_TOUPPER(ch);
9079 PyUnicode_WRITE(kind, data, i, maxchar);
9080 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009081 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009082 ++i;
9083 for(; i < len; ++i) {
9084 ch = PyUnicode_READ(kind, data, i);
9085 if (!Py_UNICODE_ISLOWER(ch)) {
9086 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9087 if (lo > maxchar)
9088 maxchar = lo;
9089 PyUnicode_WRITE(kind, data, i, lo);
9090 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009091 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009092 else if (ch > maxchar)
9093 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009094 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009095
9096 if (touched)
9097 return maxchar;
9098 else
9099 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009100}
9101
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009102static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009103fixtitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009104{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009105 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9106 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9107 const int kind = PyUnicode_KIND(self);
9108 void *data = PyUnicode_DATA(self);
9109 Py_UCS4 maxchar = 0;
9110 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009111 int previous_is_cased;
9112
9113 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009114 if (len == 1) {
9115 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9116 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
9117 if (ti != ch) {
9118 PyUnicode_WRITE(kind, data, i, ti);
9119 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00009120 }
9121 else
9122 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009123 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009124 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009125 for(; i < len; ++i) {
9126 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9127 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00009128
Benjamin Peterson29060642009-01-31 22:14:21 +00009129 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009130 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00009131 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009132 nu = Py_UNICODE_TOTITLE(ch);
9133
9134 if (nu > maxchar)
9135 maxchar = nu;
9136 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00009137
Benjamin Peterson29060642009-01-31 22:14:21 +00009138 if (Py_UNICODE_ISLOWER(ch) ||
9139 Py_UNICODE_ISUPPER(ch) ||
9140 Py_UNICODE_ISTITLE(ch))
9141 previous_is_cased = 1;
9142 else
9143 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009144 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009145 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009146}
9147
Tim Peters8ce9f162004-08-27 01:49:32 +00009148PyObject *
9149PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009150{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009151 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009152 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009153 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009154 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009155 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9156 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009157 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009158 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009159 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009160 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009161 int use_memcpy;
9162 unsigned char *res_data = NULL, *sep_data = NULL;
9163 PyObject *last_obj;
9164 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009165
Tim Peters05eba1f2004-08-27 21:32:02 +00009166 fseq = PySequence_Fast(seq, "");
9167 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009168 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009169 }
9170
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009171 /* NOTE: the following code can't call back into Python code,
9172 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009173 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009174
Tim Peters05eba1f2004-08-27 21:32:02 +00009175 seqlen = PySequence_Fast_GET_SIZE(fseq);
9176 /* If empty sequence, return u"". */
9177 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009178 Py_DECREF(fseq);
9179 Py_INCREF(unicode_empty);
9180 res = unicode_empty;
9181 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009182 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009183
Tim Peters05eba1f2004-08-27 21:32:02 +00009184 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009185 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009186 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009187 if (seqlen == 1) {
9188 if (PyUnicode_CheckExact(items[0])) {
9189 res = items[0];
9190 Py_INCREF(res);
9191 Py_DECREF(fseq);
9192 return res;
9193 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009194 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009195 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009196 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009197 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009198 /* Set up sep and seplen */
9199 if (separator == NULL) {
9200 /* fall back to a blank space separator */
9201 sep = PyUnicode_FromOrdinal(' ');
9202 if (!sep)
9203 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009204 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009205 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009206 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009207 else {
9208 if (!PyUnicode_Check(separator)) {
9209 PyErr_Format(PyExc_TypeError,
9210 "separator: expected str instance,"
9211 " %.80s found",
9212 Py_TYPE(separator)->tp_name);
9213 goto onError;
9214 }
9215 if (PyUnicode_READY(separator))
9216 goto onError;
9217 sep = separator;
9218 seplen = PyUnicode_GET_LENGTH(separator);
9219 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9220 /* inc refcount to keep this code path symmetric with the
9221 above case of a blank separator */
9222 Py_INCREF(sep);
9223 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009224 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009225 }
9226
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009227 /* There are at least two things to join, or else we have a subclass
9228 * of str in the sequence.
9229 * Do a pre-pass to figure out the total amount of space we'll
9230 * need (sz), and see whether all argument are strings.
9231 */
9232 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009233#ifdef Py_DEBUG
9234 use_memcpy = 0;
9235#else
9236 use_memcpy = 1;
9237#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009238 for (i = 0; i < seqlen; i++) {
9239 const Py_ssize_t old_sz = sz;
9240 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009241 if (!PyUnicode_Check(item)) {
9242 PyErr_Format(PyExc_TypeError,
9243 "sequence item %zd: expected str instance,"
9244 " %.80s found",
9245 i, Py_TYPE(item)->tp_name);
9246 goto onError;
9247 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009248 if (PyUnicode_READY(item) == -1)
9249 goto onError;
9250 sz += PyUnicode_GET_LENGTH(item);
9251 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009252 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009253 if (i != 0)
9254 sz += seplen;
9255 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9256 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009257 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009258 goto onError;
9259 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009260 if (use_memcpy && last_obj != NULL) {
9261 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9262 use_memcpy = 0;
9263 }
9264 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009265 }
Tim Petersced69f82003-09-16 20:30:58 +00009266
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009267 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009268 if (res == NULL)
9269 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009270
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009271 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009272#ifdef Py_DEBUG
9273 use_memcpy = 0;
9274#else
9275 if (use_memcpy) {
9276 res_data = PyUnicode_1BYTE_DATA(res);
9277 kind = PyUnicode_KIND(res);
9278 if (seplen != 0)
9279 sep_data = PyUnicode_1BYTE_DATA(sep);
9280 }
9281#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009282 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009283 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009284 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009285 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009286 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009287 if (use_memcpy) {
9288 Py_MEMCPY(res_data,
9289 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009290 kind * seplen);
9291 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009292 }
9293 else {
9294 copy_characters(res, res_offset, sep, 0, seplen);
9295 res_offset += seplen;
9296 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009297 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009298 itemlen = PyUnicode_GET_LENGTH(item);
9299 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009300 if (use_memcpy) {
9301 Py_MEMCPY(res_data,
9302 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009303 kind * itemlen);
9304 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009305 }
9306 else {
9307 copy_characters(res, res_offset, item, 0, itemlen);
9308 res_offset += itemlen;
9309 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009310 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009311 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009312 if (use_memcpy)
9313 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009314 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009315 else
9316 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009317
Tim Peters05eba1f2004-08-27 21:32:02 +00009318 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009319 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009320 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009321 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009322
Benjamin Peterson29060642009-01-31 22:14:21 +00009323 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009324 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009325 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009326 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009327 return NULL;
9328}
9329
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009330#define FILL(kind, data, value, start, length) \
9331 do { \
9332 Py_ssize_t i_ = 0; \
9333 assert(kind != PyUnicode_WCHAR_KIND); \
9334 switch ((kind)) { \
9335 case PyUnicode_1BYTE_KIND: { \
9336 unsigned char * to_ = (unsigned char *)((data)) + (start); \
9337 memset(to_, (unsigned char)value, length); \
9338 break; \
9339 } \
9340 case PyUnicode_2BYTE_KIND: { \
9341 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
9342 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9343 break; \
9344 } \
9345 default: { \
9346 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
9347 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9348 break; \
9349 } \
9350 } \
9351 } while (0)
9352
Victor Stinner9310abb2011-10-05 00:59:23 +02009353static PyObject *
9354pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009355 Py_ssize_t left,
9356 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009357 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009358{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009359 PyObject *u;
9360 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009361 int kind;
9362 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009363
9364 if (left < 0)
9365 left = 0;
9366 if (right < 0)
9367 right = 0;
9368
Tim Peters7a29bd52001-09-12 03:03:31 +00009369 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009370 Py_INCREF(self);
9371 return self;
9372 }
9373
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009374 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9375 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009376 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9377 return NULL;
9378 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009379 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9380 if (fill > maxchar)
9381 maxchar = fill;
9382 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009383 if (!u)
9384 return NULL;
9385
9386 kind = PyUnicode_KIND(u);
9387 data = PyUnicode_DATA(u);
9388 if (left)
9389 FILL(kind, data, fill, 0, left);
9390 if (right)
9391 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009392 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009393 assert(_PyUnicode_CheckConsistency(u, 1));
9394 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009395}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009396#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009397
Alexander Belopolsky40018472011-02-26 01:02:56 +00009398PyObject *
9399PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009400{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009401 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009402
9403 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009404 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009405 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009406
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009407 switch(PyUnicode_KIND(string)) {
9408 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009409 if (PyUnicode_IS_ASCII(string))
9410 list = asciilib_splitlines(
9411 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9412 PyUnicode_GET_LENGTH(string), keepends);
9413 else
9414 list = ucs1lib_splitlines(
9415 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9416 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009417 break;
9418 case PyUnicode_2BYTE_KIND:
9419 list = ucs2lib_splitlines(
9420 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
9421 PyUnicode_GET_LENGTH(string), keepends);
9422 break;
9423 case PyUnicode_4BYTE_KIND:
9424 list = ucs4lib_splitlines(
9425 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
9426 PyUnicode_GET_LENGTH(string), keepends);
9427 break;
9428 default:
9429 assert(0);
9430 list = 0;
9431 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009432 Py_DECREF(string);
9433 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009434}
9435
Alexander Belopolsky40018472011-02-26 01:02:56 +00009436static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009437split(PyObject *self,
9438 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009439 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009440{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009441 int kind1, kind2, kind;
9442 void *buf1, *buf2;
9443 Py_ssize_t len1, len2;
9444 PyObject* out;
9445
Guido van Rossumd57fd912000-03-10 22:53:23 +00009446 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009447 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009448
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009449 if (PyUnicode_READY(self) == -1)
9450 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009451
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009452 if (substring == NULL)
9453 switch(PyUnicode_KIND(self)) {
9454 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009455 if (PyUnicode_IS_ASCII(self))
9456 return asciilib_split_whitespace(
9457 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9458 PyUnicode_GET_LENGTH(self), maxcount
9459 );
9460 else
9461 return ucs1lib_split_whitespace(
9462 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9463 PyUnicode_GET_LENGTH(self), maxcount
9464 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009465 case PyUnicode_2BYTE_KIND:
9466 return ucs2lib_split_whitespace(
9467 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9468 PyUnicode_GET_LENGTH(self), maxcount
9469 );
9470 case PyUnicode_4BYTE_KIND:
9471 return ucs4lib_split_whitespace(
9472 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9473 PyUnicode_GET_LENGTH(self), maxcount
9474 );
9475 default:
9476 assert(0);
9477 return NULL;
9478 }
9479
9480 if (PyUnicode_READY(substring) == -1)
9481 return NULL;
9482
9483 kind1 = PyUnicode_KIND(self);
9484 kind2 = PyUnicode_KIND(substring);
9485 kind = kind1 > kind2 ? kind1 : kind2;
9486 buf1 = PyUnicode_DATA(self);
9487 buf2 = PyUnicode_DATA(substring);
9488 if (kind1 != kind)
9489 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9490 if (!buf1)
9491 return NULL;
9492 if (kind2 != kind)
9493 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9494 if (!buf2) {
9495 if (kind1 != kind) PyMem_Free(buf1);
9496 return NULL;
9497 }
9498 len1 = PyUnicode_GET_LENGTH(self);
9499 len2 = PyUnicode_GET_LENGTH(substring);
9500
9501 switch(kind) {
9502 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009503 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9504 out = asciilib_split(
9505 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9506 else
9507 out = ucs1lib_split(
9508 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009509 break;
9510 case PyUnicode_2BYTE_KIND:
9511 out = ucs2lib_split(
9512 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9513 break;
9514 case PyUnicode_4BYTE_KIND:
9515 out = ucs4lib_split(
9516 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9517 break;
9518 default:
9519 out = NULL;
9520 }
9521 if (kind1 != kind)
9522 PyMem_Free(buf1);
9523 if (kind2 != kind)
9524 PyMem_Free(buf2);
9525 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009526}
9527
Alexander Belopolsky40018472011-02-26 01:02:56 +00009528static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009529rsplit(PyObject *self,
9530 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009531 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009532{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009533 int kind1, kind2, kind;
9534 void *buf1, *buf2;
9535 Py_ssize_t len1, len2;
9536 PyObject* out;
9537
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009538 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009539 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009540
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009541 if (PyUnicode_READY(self) == -1)
9542 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009543
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009544 if (substring == NULL)
9545 switch(PyUnicode_KIND(self)) {
9546 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009547 if (PyUnicode_IS_ASCII(self))
9548 return asciilib_rsplit_whitespace(
9549 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9550 PyUnicode_GET_LENGTH(self), maxcount
9551 );
9552 else
9553 return ucs1lib_rsplit_whitespace(
9554 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9555 PyUnicode_GET_LENGTH(self), maxcount
9556 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009557 case PyUnicode_2BYTE_KIND:
9558 return ucs2lib_rsplit_whitespace(
9559 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9560 PyUnicode_GET_LENGTH(self), maxcount
9561 );
9562 case PyUnicode_4BYTE_KIND:
9563 return ucs4lib_rsplit_whitespace(
9564 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9565 PyUnicode_GET_LENGTH(self), maxcount
9566 );
9567 default:
9568 assert(0);
9569 return NULL;
9570 }
9571
9572 if (PyUnicode_READY(substring) == -1)
9573 return NULL;
9574
9575 kind1 = PyUnicode_KIND(self);
9576 kind2 = PyUnicode_KIND(substring);
9577 kind = kind1 > kind2 ? kind1 : kind2;
9578 buf1 = PyUnicode_DATA(self);
9579 buf2 = PyUnicode_DATA(substring);
9580 if (kind1 != kind)
9581 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9582 if (!buf1)
9583 return NULL;
9584 if (kind2 != kind)
9585 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9586 if (!buf2) {
9587 if (kind1 != kind) PyMem_Free(buf1);
9588 return NULL;
9589 }
9590 len1 = PyUnicode_GET_LENGTH(self);
9591 len2 = PyUnicode_GET_LENGTH(substring);
9592
9593 switch(kind) {
9594 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009595 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9596 out = asciilib_rsplit(
9597 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9598 else
9599 out = ucs1lib_rsplit(
9600 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009601 break;
9602 case PyUnicode_2BYTE_KIND:
9603 out = ucs2lib_rsplit(
9604 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9605 break;
9606 case PyUnicode_4BYTE_KIND:
9607 out = ucs4lib_rsplit(
9608 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9609 break;
9610 default:
9611 out = NULL;
9612 }
9613 if (kind1 != kind)
9614 PyMem_Free(buf1);
9615 if (kind2 != kind)
9616 PyMem_Free(buf2);
9617 return out;
9618}
9619
9620static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009621anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9622 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009623{
9624 switch(kind) {
9625 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009626 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9627 return asciilib_find(buf1, len1, buf2, len2, offset);
9628 else
9629 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009630 case PyUnicode_2BYTE_KIND:
9631 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9632 case PyUnicode_4BYTE_KIND:
9633 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9634 }
9635 assert(0);
9636 return -1;
9637}
9638
9639static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009640anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
9641 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009642{
9643 switch(kind) {
9644 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009645 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
9646 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
9647 else
9648 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009649 case PyUnicode_2BYTE_KIND:
9650 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9651 case PyUnicode_4BYTE_KIND:
9652 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9653 }
9654 assert(0);
9655 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009656}
9657
Alexander Belopolsky40018472011-02-26 01:02:56 +00009658static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009659replace(PyObject *self, PyObject *str1,
9660 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009661{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009662 PyObject *u;
9663 char *sbuf = PyUnicode_DATA(self);
9664 char *buf1 = PyUnicode_DATA(str1);
9665 char *buf2 = PyUnicode_DATA(str2);
9666 int srelease = 0, release1 = 0, release2 = 0;
9667 int skind = PyUnicode_KIND(self);
9668 int kind1 = PyUnicode_KIND(str1);
9669 int kind2 = PyUnicode_KIND(str2);
9670 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9671 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9672 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +02009673 int mayshrink;
9674 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009675
9676 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009677 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009678 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009679 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009680
Victor Stinner59de0ee2011-10-07 10:01:28 +02009681 if (str1 == str2)
9682 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009683 if (skind < kind1)
9684 /* substring too wide to be present */
9685 goto nothing;
9686
Victor Stinner49a0a212011-10-12 23:46:10 +02009687 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9688 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
9689 /* Replacing str1 with str2 may cause a maxchar reduction in the
9690 result string. */
9691 mayshrink = (maxchar_str2 < maxchar);
9692 maxchar = Py_MAX(maxchar, maxchar_str2);
9693
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009694 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00009695 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009696 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009697 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009698 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009699 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009700 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +02009701 Py_UCS4 u1, u2;
9702 int rkind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009703 u1 = PyUnicode_READ_CHAR(str1, 0);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009704 if (findchar(sbuf, PyUnicode_KIND(self),
9705 slen, u1, 1) < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009706 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009707 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009708 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009709 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009710 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009711 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009712 rkind = PyUnicode_KIND(u);
9713 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
9714 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009715 if (--maxcount < 0)
9716 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009717 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009718 }
Victor Stinner49a0a212011-10-12 23:46:10 +02009719 }
9720 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009721 int rkind = skind;
9722 char *res;
Victor Stinner25a4b292011-10-06 12:31:55 +02009723
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009724 if (kind1 < rkind) {
9725 /* widen substring */
9726 buf1 = _PyUnicode_AsKind(str1, rkind);
9727 if (!buf1) goto error;
9728 release1 = 1;
9729 }
Victor Stinnerc3cec782011-10-05 21:24:08 +02009730 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009731 if (i < 0)
9732 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009733 if (rkind > kind2) {
9734 /* widen replacement */
9735 buf2 = _PyUnicode_AsKind(str2, rkind);
9736 if (!buf2) goto error;
9737 release2 = 1;
9738 }
9739 else if (rkind < kind2) {
9740 /* widen self and buf1 */
9741 rkind = kind2;
9742 if (release1) PyMem_Free(buf1);
9743 sbuf = _PyUnicode_AsKind(self, rkind);
9744 if (!sbuf) goto error;
9745 srelease = 1;
9746 buf1 = _PyUnicode_AsKind(str1, rkind);
9747 if (!buf1) goto error;
9748 release1 = 1;
9749 }
Victor Stinner49a0a212011-10-12 23:46:10 +02009750 u = PyUnicode_New(slen, maxchar);
9751 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009752 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +02009753 assert(PyUnicode_KIND(u) == rkind);
9754 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +02009755
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009756 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009757 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009758 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009759 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009760 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009761 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009762
9763 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +02009764 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009765 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009766 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009767 if (i == -1)
9768 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009769 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009770 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009771 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009772 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009773 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009774 }
Victor Stinner49a0a212011-10-12 23:46:10 +02009775 }
9776 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009777 Py_ssize_t n, i, j, ires;
9778 Py_ssize_t product, new_size;
9779 int rkind = skind;
9780 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009781
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009782 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +02009783 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009784 buf1 = _PyUnicode_AsKind(str1, rkind);
9785 if (!buf1) goto error;
9786 release1 = 1;
9787 }
Victor Stinnerc3cec782011-10-05 21:24:08 +02009788 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009789 if (n == 0)
9790 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009791 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +02009792 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009793 buf2 = _PyUnicode_AsKind(str2, rkind);
9794 if (!buf2) goto error;
9795 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009796 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009797 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +02009798 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009799 rkind = kind2;
9800 sbuf = _PyUnicode_AsKind(self, rkind);
9801 if (!sbuf) goto error;
9802 srelease = 1;
9803 if (release1) PyMem_Free(buf1);
9804 buf1 = _PyUnicode_AsKind(str1, rkind);
9805 if (!buf1) goto error;
9806 release1 = 1;
9807 }
9808 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
9809 PyUnicode_GET_LENGTH(str1))); */
9810 product = n * (len2-len1);
9811 if ((product / (len2-len1)) != n) {
9812 PyErr_SetString(PyExc_OverflowError,
9813 "replace string is too long");
9814 goto error;
9815 }
9816 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +02009817 if (new_size == 0) {
9818 Py_INCREF(unicode_empty);
9819 u = unicode_empty;
9820 goto done;
9821 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009822 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
9823 PyErr_SetString(PyExc_OverflowError,
9824 "replace string is too long");
9825 goto error;
9826 }
Victor Stinner49a0a212011-10-12 23:46:10 +02009827 u = PyUnicode_New(new_size, maxchar);
9828 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009829 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +02009830 assert(PyUnicode_KIND(u) == rkind);
9831 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009832 ires = i = 0;
9833 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009834 while (n-- > 0) {
9835 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +02009836 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009837 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009838 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009839 if (j == -1)
9840 break;
9841 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009842 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009843 memcpy(res + rkind * ires,
9844 sbuf + rkind * i,
9845 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009846 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009847 }
9848 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009849 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009850 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009851 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009852 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009853 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009854 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009855 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009856 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009857 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009858 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009859 memcpy(res + rkind * ires,
9860 sbuf + rkind * i,
9861 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +02009862 }
9863 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009864 /* interleave */
9865 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009866 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009867 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009868 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009869 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009870 if (--n <= 0)
9871 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009872 memcpy(res + rkind * ires,
9873 sbuf + rkind * i,
9874 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009875 ires++;
9876 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009877 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009878 memcpy(res + rkind * ires,
9879 sbuf + rkind * i,
9880 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009881 }
Victor Stinner49a0a212011-10-12 23:46:10 +02009882 }
9883
9884 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +02009885 unicode_adjust_maxchar(&u);
9886 if (u == NULL)
9887 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009888 }
Victor Stinner49a0a212011-10-12 23:46:10 +02009889
9890 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009891 if (srelease)
9892 PyMem_FREE(sbuf);
9893 if (release1)
9894 PyMem_FREE(buf1);
9895 if (release2)
9896 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009897 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009898 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009899
Benjamin Peterson29060642009-01-31 22:14:21 +00009900 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009901 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009902 if (srelease)
9903 PyMem_FREE(sbuf);
9904 if (release1)
9905 PyMem_FREE(buf1);
9906 if (release2)
9907 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009908 if (PyUnicode_CheckExact(self)) {
9909 Py_INCREF(self);
9910 return (PyObject *) self;
9911 }
Victor Stinner034f6cf2011-09-30 02:26:44 +02009912 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009913 error:
9914 if (srelease && sbuf)
9915 PyMem_FREE(sbuf);
9916 if (release1 && buf1)
9917 PyMem_FREE(buf1);
9918 if (release2 && buf2)
9919 PyMem_FREE(buf2);
9920 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009921}
9922
9923/* --- Unicode Object Methods --------------------------------------------- */
9924
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009925PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009926 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009927\n\
9928Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009929characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009930
9931static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +02009932unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009933{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009934 return fixup(self, fixtitle);
9935}
9936
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009937PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009938 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009939\n\
9940Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009941have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009942
9943static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +02009944unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009945{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009946 return fixup(self, fixcapitalize);
9947}
9948
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out by the surrounding #if 0).
   Splits self with split(self, NULL, -1), replaces every list item by
   fixup(item, fixcapitalize), and joins the list with PyUnicode_Join(NULL,
   list).  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the list in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); ++idx) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                            fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
9986
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009987/* Argument converter. Coerces to a single unicode character */
9988
9989static int
9990convert_uc(PyObject *obj, void *addr)
9991{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009992 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009993 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009994
Benjamin Peterson14339b62009-01-31 16:36:08 +00009995 uniobj = PyUnicode_FromObject(obj);
9996 if (uniobj == NULL) {
9997 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009998 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009999 return 0;
10000 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010001 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010002 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010003 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010004 Py_DECREF(uniobj);
10005 return 0;
10006 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010007 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010008 Py_DECREF(uniobj);
10009 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010010}
10011
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010012PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010013 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010014\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010015Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010016done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010017
10018static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010019unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010020{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010021 Py_ssize_t marg, left;
10022 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010023 Py_UCS4 fillchar = ' ';
10024
Victor Stinnere9a29352011-10-01 02:14:59 +020010025 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010026 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010027
Victor Stinnere9a29352011-10-01 02:14:59 +020010028 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010029 return NULL;
10030
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010031 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010032 Py_INCREF(self);
10033 return (PyObject*) self;
10034 }
10035
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010036 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010037 left = marg / 2 + (marg & width & 1);
10038
Victor Stinner9310abb2011-10-05 00:59:23 +020010039 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010040}
10041
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010042/* This function assumes that str1 and str2 are readied by the caller. */
10043
Marc-André Lemburge5034372000-08-08 08:04:29 +000010044static int
10045unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
10046{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010047 int kind1, kind2;
10048 void *data1, *data2;
10049 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010050
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010051 kind1 = PyUnicode_KIND(str1);
10052 kind2 = PyUnicode_KIND(str2);
10053 data1 = PyUnicode_DATA(str1);
10054 data2 = PyUnicode_DATA(str2);
10055 len1 = PyUnicode_GET_LENGTH(str1);
10056 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010057
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010058 for (i = 0; i < len1 && i < len2; ++i) {
10059 Py_UCS4 c1, c2;
10060 c1 = PyUnicode_READ(kind1, data1, i);
10061 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010062
10063 if (c1 != c2)
10064 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010065 }
10066
10067 return (len1 < len2) ? -1 : (len1 != len2);
10068}
10069
Alexander Belopolsky40018472011-02-26 01:02:56 +000010070int
10071PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010072{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010073 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10074 if (PyUnicode_READY(left) == -1 ||
10075 PyUnicode_READY(right) == -1)
10076 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010077 return unicode_compare((PyUnicodeObject *)left,
10078 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010079 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010080 PyErr_Format(PyExc_TypeError,
10081 "Can't compare %.100s and %.100s",
10082 left->ob_type->tp_name,
10083 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010084 return -1;
10085}
10086
Martin v. Löwis5b222132007-06-10 09:51:05 +000010087int
10088PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10089{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010090 Py_ssize_t i;
10091 int kind;
10092 void *data;
10093 Py_UCS4 chr;
10094
Victor Stinner910337b2011-10-03 03:20:16 +020010095 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010096 if (PyUnicode_READY(uni) == -1)
10097 return -1;
10098 kind = PyUnicode_KIND(uni);
10099 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010100 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010101 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10102 if (chr != str[i])
10103 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010104 /* This check keeps Python strings that end in '\0' from comparing equal
10105 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010106 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010107 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010108 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010109 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010110 return 0;
10111}
10112
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010113
Benjamin Peterson29060642009-01-31 22:14:21 +000010114#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010115 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010116
Alexander Belopolsky40018472011-02-26 01:02:56 +000010117PyObject *
10118PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010119{
10120 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010121
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010122 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10123 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010124 if (PyUnicode_READY(left) == -1 ||
10125 PyUnicode_READY(right) == -1)
10126 return NULL;
10127 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10128 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010129 if (op == Py_EQ) {
10130 Py_INCREF(Py_False);
10131 return Py_False;
10132 }
10133 if (op == Py_NE) {
10134 Py_INCREF(Py_True);
10135 return Py_True;
10136 }
10137 }
10138 if (left == right)
10139 result = 0;
10140 else
10141 result = unicode_compare((PyUnicodeObject *)left,
10142 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010143
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010144 /* Convert the return value to a Boolean */
10145 switch (op) {
10146 case Py_EQ:
10147 v = TEST_COND(result == 0);
10148 break;
10149 case Py_NE:
10150 v = TEST_COND(result != 0);
10151 break;
10152 case Py_LE:
10153 v = TEST_COND(result <= 0);
10154 break;
10155 case Py_GE:
10156 v = TEST_COND(result >= 0);
10157 break;
10158 case Py_LT:
10159 v = TEST_COND(result == -1);
10160 break;
10161 case Py_GT:
10162 v = TEST_COND(result == 1);
10163 break;
10164 default:
10165 PyErr_BadArgument();
10166 return NULL;
10167 }
10168 Py_INCREF(v);
10169 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010170 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010171
Brian Curtindfc80e32011-08-10 20:28:54 -050010172 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010173}
10174
Alexander Belopolsky40018472011-02-26 01:02:56 +000010175int
10176PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010177{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010178 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010179 int kind1, kind2, kind;
10180 void *buf1, *buf2;
10181 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010182 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010183
10184 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010185 sub = PyUnicode_FromObject(element);
10186 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010187 PyErr_Format(PyExc_TypeError,
10188 "'in <string>' requires string as left operand, not %s",
10189 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010190 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010191 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010192 if (PyUnicode_READY(sub) == -1)
10193 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010194
Thomas Wouters477c8d52006-05-27 19:21:47 +000010195 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +020010196 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010197 Py_DECREF(sub);
10198 return -1;
10199 }
10200
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010201 kind1 = PyUnicode_KIND(str);
10202 kind2 = PyUnicode_KIND(sub);
10203 kind = kind1 > kind2 ? kind1 : kind2;
10204 buf1 = PyUnicode_DATA(str);
10205 buf2 = PyUnicode_DATA(sub);
10206 if (kind1 != kind)
10207 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
10208 if (!buf1) {
10209 Py_DECREF(sub);
10210 return -1;
10211 }
10212 if (kind2 != kind)
10213 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
10214 if (!buf2) {
10215 Py_DECREF(sub);
10216 if (kind1 != kind) PyMem_Free(buf1);
10217 return -1;
10218 }
10219 len1 = PyUnicode_GET_LENGTH(str);
10220 len2 = PyUnicode_GET_LENGTH(sub);
10221
10222 switch(kind) {
10223 case PyUnicode_1BYTE_KIND:
10224 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10225 break;
10226 case PyUnicode_2BYTE_KIND:
10227 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10228 break;
10229 case PyUnicode_4BYTE_KIND:
10230 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10231 break;
10232 default:
10233 result = -1;
10234 assert(0);
10235 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010236
10237 Py_DECREF(str);
10238 Py_DECREF(sub);
10239
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010240 if (kind1 != kind)
10241 PyMem_Free(buf1);
10242 if (kind2 != kind)
10243 PyMem_Free(buf2);
10244
Guido van Rossum403d68b2000-03-13 15:55:09 +000010245 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010246}
10247
Guido van Rossumd57fd912000-03-10 22:53:23 +000010248/* Concat to string or Unicode object giving a new Unicode object. */
10249
Alexander Belopolsky40018472011-02-26 01:02:56 +000010250PyObject *
10251PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010252{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010253 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010254 Py_UCS4 maxchar, maxchar2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010255
10256 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010257 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010258 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010259 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010260 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010261 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010262 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010263
10264 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010265 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010266 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010267 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010268 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010269 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010270 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010271 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010272 }
10273
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010274 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010275 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
10276 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010277
Guido van Rossumd57fd912000-03-10 22:53:23 +000010278 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010279 w = PyUnicode_New(
10280 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
10281 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010282 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010283 goto onError;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010284 copy_characters(w, 0, u, 0, PyUnicode_GET_LENGTH(u));
10285 copy_characters(w, PyUnicode_GET_LENGTH(u), v, 0, PyUnicode_GET_LENGTH(v));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010286 Py_DECREF(u);
10287 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010288 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010289 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010290
Benjamin Peterson29060642009-01-31 22:14:21 +000010291 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010292 Py_XDECREF(u);
10293 Py_XDECREF(v);
10294 return NULL;
10295}
10296
Victor Stinnerb0923652011-10-04 01:17:31 +020010297static void
10298unicode_append_inplace(PyObject **p_left, PyObject *right)
10299{
10300 Py_ssize_t left_len, right_len, new_len;
Victor Stinnerb0923652011-10-04 01:17:31 +020010301
10302 assert(PyUnicode_IS_READY(*p_left));
10303 assert(PyUnicode_IS_READY(right));
10304
10305 left_len = PyUnicode_GET_LENGTH(*p_left);
10306 right_len = PyUnicode_GET_LENGTH(right);
10307 if (left_len > PY_SSIZE_T_MAX - right_len) {
10308 PyErr_SetString(PyExc_OverflowError,
10309 "strings are too large to concat");
10310 goto error;
10311 }
10312 new_len = left_len + right_len;
10313
10314 /* Now we own the last reference to 'left', so we can resize it
10315 * in-place.
10316 */
10317 if (unicode_resize(p_left, new_len) != 0) {
10318 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10319 * deallocated so it cannot be put back into
10320 * 'variable'. The MemoryError is raised when there
10321 * is no value in 'variable', which might (very
10322 * remotely) be a cause of incompatibilities.
10323 */
10324 goto error;
10325 }
10326 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010327 copy_characters(*p_left, left_len, right, 0, right_len);
10328 _PyUnicode_DIRTY(*p_left);
Victor Stinnerb0923652011-10-04 01:17:31 +020010329 return;
10330
10331error:
10332 Py_DECREF(*p_left);
10333 *p_left = NULL;
10334}
10335
Walter Dörwald1ab83302007-05-18 17:15:44 +000010336void
Victor Stinner23e56682011-10-03 03:54:37 +020010337PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010338{
Victor Stinner23e56682011-10-03 03:54:37 +020010339 PyObject *left, *res;
10340
10341 if (p_left == NULL) {
10342 if (!PyErr_Occurred())
10343 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010344 return;
10345 }
Victor Stinner23e56682011-10-03 03:54:37 +020010346 left = *p_left;
10347 if (right == NULL || !PyUnicode_Check(left)) {
10348 if (!PyErr_Occurred())
10349 PyErr_BadInternalCall();
10350 goto error;
10351 }
10352
Victor Stinnere1335c72011-10-04 20:53:03 +020010353 if (PyUnicode_READY(left))
10354 goto error;
10355 if (PyUnicode_READY(right))
10356 goto error;
10357
Victor Stinner23e56682011-10-03 03:54:37 +020010358 if (PyUnicode_CheckExact(left) && left != unicode_empty
10359 && PyUnicode_CheckExact(right) && right != unicode_empty
10360 && unicode_resizable(left)
10361 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
10362 || _PyUnicode_WSTR(left) != NULL))
10363 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010364 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10365 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010366 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010367 not so different than duplicating the string. */
10368 if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
Victor Stinner23e56682011-10-03 03:54:37 +020010369 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010370 unicode_append_inplace(p_left, right);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010371 if (p_left != NULL)
10372 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010373 return;
10374 }
10375 }
10376
10377 res = PyUnicode_Concat(left, right);
10378 if (res == NULL)
10379 goto error;
10380 Py_DECREF(left);
10381 *p_left = res;
10382 return;
10383
10384error:
10385 Py_DECREF(*p_left);
10386 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +000010387}
10388
10389void
10390PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10391{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010392 PyUnicode_Append(pleft, right);
10393 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010394}
10395
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010396PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010397 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010398\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010399Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010400string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010401interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010402
10403static PyObject *
10404unicode_count(PyUnicodeObject *self, PyObject *args)
10405{
10406 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010407 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010408 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010409 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010410 int kind1, kind2, kind;
10411 void *buf1, *buf2;
10412 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010413
Jesus Ceaac451502011-04-20 17:09:23 +020010414 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10415 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010416 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010417
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010418 kind1 = PyUnicode_KIND(self);
10419 kind2 = PyUnicode_KIND(substring);
10420 kind = kind1 > kind2 ? kind1 : kind2;
10421 buf1 = PyUnicode_DATA(self);
10422 buf2 = PyUnicode_DATA(substring);
10423 if (kind1 != kind)
10424 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
10425 if (!buf1) {
10426 Py_DECREF(substring);
10427 return NULL;
10428 }
10429 if (kind2 != kind)
10430 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
10431 if (!buf2) {
10432 Py_DECREF(substring);
10433 if (kind1 != kind) PyMem_Free(buf1);
10434 return NULL;
10435 }
10436 len1 = PyUnicode_GET_LENGTH(self);
10437 len2 = PyUnicode_GET_LENGTH(substring);
10438
10439 ADJUST_INDICES(start, end, len1);
10440 switch(kind) {
10441 case PyUnicode_1BYTE_KIND:
10442 iresult = ucs1lib_count(
10443 ((Py_UCS1*)buf1) + start, end - start,
10444 buf2, len2, PY_SSIZE_T_MAX
10445 );
10446 break;
10447 case PyUnicode_2BYTE_KIND:
10448 iresult = ucs2lib_count(
10449 ((Py_UCS2*)buf1) + start, end - start,
10450 buf2, len2, PY_SSIZE_T_MAX
10451 );
10452 break;
10453 case PyUnicode_4BYTE_KIND:
10454 iresult = ucs4lib_count(
10455 ((Py_UCS4*)buf1) + start, end - start,
10456 buf2, len2, PY_SSIZE_T_MAX
10457 );
10458 break;
10459 default:
10460 assert(0); iresult = 0;
10461 }
10462
10463 result = PyLong_FromSsize_t(iresult);
10464
10465 if (kind1 != kind)
10466 PyMem_Free(buf1);
10467 if (kind2 != kind)
10468 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010469
10470 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010471
Guido van Rossumd57fd912000-03-10 22:53:23 +000010472 return result;
10473}
10474
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010475PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010476 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010477\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010478Encode S using the codec registered for encoding. Default encoding\n\
10479is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010480handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010481a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10482'xmlcharrefreplace' as well as any other name registered with\n\
10483codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010484
10485static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +000010486unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010487{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010488 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010489 char *encoding = NULL;
10490 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010491
Benjamin Peterson308d6372009-09-18 21:42:35 +000010492 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10493 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010494 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +000010495 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010496}
10497
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010498PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010499 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010500\n\
10501Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010502If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010503
10504static PyObject*
10505unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
10506{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010507 Py_ssize_t i, j, line_pos, src_len, incr;
10508 Py_UCS4 ch;
10509 PyObject *u;
10510 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010511 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010512 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010513 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010514
10515 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010516 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010517
Antoine Pitrou22425222011-10-04 19:10:51 +020010518 if (PyUnicode_READY(self) == -1)
10519 return NULL;
10520
Thomas Wouters7e474022000-07-16 12:04:32 +000010521 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010522 src_len = PyUnicode_GET_LENGTH(self);
10523 i = j = line_pos = 0;
10524 kind = PyUnicode_KIND(self);
10525 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010526 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010527 for (; i < src_len; i++) {
10528 ch = PyUnicode_READ(kind, src_data, i);
10529 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010530 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010531 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010532 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010533 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010534 goto overflow;
10535 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010536 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010537 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010538 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010539 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010540 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010541 goto overflow;
10542 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010543 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010544 if (ch == '\n' || ch == '\r')
10545 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010546 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010547 }
Antoine Pitroue19aa382011-10-04 16:04:01 +020010548 if (!found && PyUnicode_CheckExact(self)) {
10549 Py_INCREF((PyObject *) self);
10550 return (PyObject *) self;
10551 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010552
Guido van Rossumd57fd912000-03-10 22:53:23 +000010553 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010554 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010555 if (!u)
10556 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010557 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010558
Antoine Pitroue71d5742011-10-04 15:55:09 +020010559 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010560
Antoine Pitroue71d5742011-10-04 15:55:09 +020010561 for (; i < src_len; i++) {
10562 ch = PyUnicode_READ(kind, src_data, i);
10563 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010564 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010565 incr = tabsize - (line_pos % tabsize);
10566 line_pos += incr;
10567 while (incr--) {
10568 PyUnicode_WRITE(kind, dest_data, j, ' ');
10569 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010570 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010571 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010572 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010573 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010574 line_pos++;
10575 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010576 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010577 if (ch == '\n' || ch == '\r')
10578 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010579 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010580 }
10581 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinner17efeed2011-10-04 20:05:46 +020010582#ifndef DONT_MAKE_RESULT_READY
10583 if (_PyUnicode_READY_REPLACE(&u)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010584 Py_DECREF(u);
10585 return NULL;
10586 }
Victor Stinner17efeed2011-10-04 20:05:46 +020010587#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010588 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010589 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010590
Antoine Pitroue71d5742011-10-04 15:55:09 +020010591 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010592 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10593 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010594}
10595
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010596PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010597 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010598\n\
10599Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010600such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010601arguments start and end are interpreted as in slice notation.\n\
10602\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010603Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010604
10605static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010606unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010607{
Jesus Ceaac451502011-04-20 17:09:23 +020010608 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010609 Py_ssize_t start;
10610 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010611 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010612
Jesus Ceaac451502011-04-20 17:09:23 +020010613 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10614 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010615 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010616
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010617 if (PyUnicode_READY(self) == -1)
10618 return NULL;
10619 if (PyUnicode_READY(substring) == -1)
10620 return NULL;
10621
Victor Stinner794d5672011-10-10 03:21:36 +020010622 result = any_find_slice(1,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010623 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010624 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010625
10626 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010627
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010628 if (result == -2)
10629 return NULL;
10630
Christian Heimes217cfd12007-12-02 14:31:20 +000010631 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010632}
10633
10634static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010635unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010636{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010637 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10638 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010639 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010640 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010641}
10642
Guido van Rossumc2504932007-09-18 19:42:40 +000010643/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010644 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010645static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +000010646unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010647{
Guido van Rossumc2504932007-09-18 19:42:40 +000010648 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010649 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010650
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010651 if (_PyUnicode_HASH(self) != -1)
10652 return _PyUnicode_HASH(self);
10653 if (PyUnicode_READY(self) == -1)
10654 return -1;
10655 len = PyUnicode_GET_LENGTH(self);
10656
10657 /* The hash function as a macro, gets expanded three times below. */
10658#define HASH(P) \
10659 x = (Py_uhash_t)*P << 7; \
10660 while (--len >= 0) \
10661 x = (1000003*x) ^ (Py_uhash_t)*P++;
10662
10663 switch (PyUnicode_KIND(self)) {
10664 case PyUnicode_1BYTE_KIND: {
10665 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10666 HASH(c);
10667 break;
10668 }
10669 case PyUnicode_2BYTE_KIND: {
10670 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10671 HASH(s);
10672 break;
10673 }
10674 default: {
10675 Py_UCS4 *l;
10676 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10677 "Impossible switch case in unicode_hash");
10678 l = PyUnicode_4BYTE_DATA(self);
10679 HASH(l);
10680 break;
10681 }
10682 }
10683 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
10684
Guido van Rossumc2504932007-09-18 19:42:40 +000010685 if (x == -1)
10686 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010687 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010688 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010689}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010690#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000010691
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010692PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010693 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010694\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010695Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010696
10697static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010698unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010699{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010700 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +020010701 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010702 Py_ssize_t start;
10703 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010704
Jesus Ceaac451502011-04-20 17:09:23 +020010705 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10706 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010707 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010708
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010709 if (PyUnicode_READY(self) == -1)
10710 return NULL;
10711 if (PyUnicode_READY(substring) == -1)
10712 return NULL;
10713
Victor Stinner794d5672011-10-10 03:21:36 +020010714 result = any_find_slice(1,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010715 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010716 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010717
10718 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010719
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010720 if (result == -2)
10721 return NULL;
10722
Guido van Rossumd57fd912000-03-10 22:53:23 +000010723 if (result < 0) {
10724 PyErr_SetString(PyExc_ValueError, "substring not found");
10725 return NULL;
10726 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010727
Christian Heimes217cfd12007-12-02 14:31:20 +000010728 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010729}
10730
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010731PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010732 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010733\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010734Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010735at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010736
10737static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010738unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010739{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010740 Py_ssize_t i, length;
10741 int kind;
10742 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010743 int cased;
10744
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010745 if (PyUnicode_READY(self) == -1)
10746 return NULL;
10747 length = PyUnicode_GET_LENGTH(self);
10748 kind = PyUnicode_KIND(self);
10749 data = PyUnicode_DATA(self);
10750
Guido van Rossumd57fd912000-03-10 22:53:23 +000010751 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010752 if (length == 1)
10753 return PyBool_FromLong(
10754 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010755
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010756 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010757 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010758 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010759
Guido van Rossumd57fd912000-03-10 22:53:23 +000010760 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010761 for (i = 0; i < length; i++) {
10762 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010763
Benjamin Peterson29060642009-01-31 22:14:21 +000010764 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
10765 return PyBool_FromLong(0);
10766 else if (!cased && Py_UNICODE_ISLOWER(ch))
10767 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010768 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010769 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010770}
10771
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010772PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010773 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010774\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010775Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010776at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010777
10778static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010779unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010780{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010781 Py_ssize_t i, length;
10782 int kind;
10783 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010784 int cased;
10785
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010786 if (PyUnicode_READY(self) == -1)
10787 return NULL;
10788 length = PyUnicode_GET_LENGTH(self);
10789 kind = PyUnicode_KIND(self);
10790 data = PyUnicode_DATA(self);
10791
Guido van Rossumd57fd912000-03-10 22:53:23 +000010792 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010793 if (length == 1)
10794 return PyBool_FromLong(
10795 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010796
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010797 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010798 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010799 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010800
Guido van Rossumd57fd912000-03-10 22:53:23 +000010801 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010802 for (i = 0; i < length; i++) {
10803 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010804
Benjamin Peterson29060642009-01-31 22:14:21 +000010805 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
10806 return PyBool_FromLong(0);
10807 else if (!cased && Py_UNICODE_ISUPPER(ch))
10808 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010809 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010810 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010811}
10812
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010813PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010814 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010815\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010816Return True if S is a titlecased string and there is at least one\n\
10817character in S, i.e. upper- and titlecase characters may only\n\
10818follow uncased characters and lowercase characters only cased ones.\n\
10819Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010820
10821static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010822unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010823{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010824 Py_ssize_t i, length;
10825 int kind;
10826 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010827 int cased, previous_is_cased;
10828
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010829 if (PyUnicode_READY(self) == -1)
10830 return NULL;
10831 length = PyUnicode_GET_LENGTH(self);
10832 kind = PyUnicode_KIND(self);
10833 data = PyUnicode_DATA(self);
10834
Guido van Rossumd57fd912000-03-10 22:53:23 +000010835 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010836 if (length == 1) {
10837 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10838 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
10839 (Py_UNICODE_ISUPPER(ch) != 0));
10840 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010841
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010842 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010843 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010844 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010845
Guido van Rossumd57fd912000-03-10 22:53:23 +000010846 cased = 0;
10847 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010848 for (i = 0; i < length; i++) {
10849 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010850
Benjamin Peterson29060642009-01-31 22:14:21 +000010851 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10852 if (previous_is_cased)
10853 return PyBool_FromLong(0);
10854 previous_is_cased = 1;
10855 cased = 1;
10856 }
10857 else if (Py_UNICODE_ISLOWER(ch)) {
10858 if (!previous_is_cased)
10859 return PyBool_FromLong(0);
10860 previous_is_cased = 1;
10861 cased = 1;
10862 }
10863 else
10864 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010865 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010866 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010867}
10868
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010869PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010870 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010871\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010872Return True if all characters in S are whitespace\n\
10873and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010874
10875static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010876unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010877{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010878 Py_ssize_t i, length;
10879 int kind;
10880 void *data;
10881
10882 if (PyUnicode_READY(self) == -1)
10883 return NULL;
10884 length = PyUnicode_GET_LENGTH(self);
10885 kind = PyUnicode_KIND(self);
10886 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010887
Guido van Rossumd57fd912000-03-10 22:53:23 +000010888 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010889 if (length == 1)
10890 return PyBool_FromLong(
10891 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010892
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010893 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010894 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010895 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010896
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010897 for (i = 0; i < length; i++) {
10898 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010899 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010900 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010901 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010902 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010903}
10904
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010905PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010906 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010907\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010908Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010909and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010910
10911static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010912unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010913{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010914 Py_ssize_t i, length;
10915 int kind;
10916 void *data;
10917
10918 if (PyUnicode_READY(self) == -1)
10919 return NULL;
10920 length = PyUnicode_GET_LENGTH(self);
10921 kind = PyUnicode_KIND(self);
10922 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010923
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010924 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010925 if (length == 1)
10926 return PyBool_FromLong(
10927 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010928
10929 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010930 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010931 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010932
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010933 for (i = 0; i < length; i++) {
10934 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010935 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010936 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010937 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010938}
10939
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010940PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010941 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010942\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010943Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010944and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010945
10946static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010947unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010948{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010949 int kind;
10950 void *data;
10951 Py_ssize_t len, i;
10952
10953 if (PyUnicode_READY(self) == -1)
10954 return NULL;
10955
10956 kind = PyUnicode_KIND(self);
10957 data = PyUnicode_DATA(self);
10958 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010959
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010960 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010961 if (len == 1) {
10962 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10963 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10964 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010965
10966 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010967 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010968 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010969
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010970 for (i = 0; i < len; i++) {
10971 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010972 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010973 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010974 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010975 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010976}
10977
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010978PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010979 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010980\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010981Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010982False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010983
10984static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010985unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010986{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010987 Py_ssize_t i, length;
10988 int kind;
10989 void *data;
10990
10991 if (PyUnicode_READY(self) == -1)
10992 return NULL;
10993 length = PyUnicode_GET_LENGTH(self);
10994 kind = PyUnicode_KIND(self);
10995 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010996
Guido van Rossumd57fd912000-03-10 22:53:23 +000010997 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010998 if (length == 1)
10999 return PyBool_FromLong(
11000 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011001
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011002 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011003 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011004 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011005
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011006 for (i = 0; i < length; i++) {
11007 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011008 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011009 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011010 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011011}
11012
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011013PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011014 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011015\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011016Return True if all characters in S are digits\n\
11017and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011018
11019static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011020unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011021{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011022 Py_ssize_t i, length;
11023 int kind;
11024 void *data;
11025
11026 if (PyUnicode_READY(self) == -1)
11027 return NULL;
11028 length = PyUnicode_GET_LENGTH(self);
11029 kind = PyUnicode_KIND(self);
11030 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011031
Guido van Rossumd57fd912000-03-10 22:53:23 +000011032 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011033 if (length == 1) {
11034 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11035 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11036 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011037
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011038 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011039 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011040 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011041
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011042 for (i = 0; i < length; i++) {
11043 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011044 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011045 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011046 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011047}
11048
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011049PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011050 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011051\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011052Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011053False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011054
11055static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011056unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011057{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011058 Py_ssize_t i, length;
11059 int kind;
11060 void *data;
11061
11062 if (PyUnicode_READY(self) == -1)
11063 return NULL;
11064 length = PyUnicode_GET_LENGTH(self);
11065 kind = PyUnicode_KIND(self);
11066 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011067
Guido van Rossumd57fd912000-03-10 22:53:23 +000011068 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011069 if (length == 1)
11070 return PyBool_FromLong(
11071 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011072
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011073 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011074 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011075 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011076
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011077 for (i = 0; i < length; i++) {
11078 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011079 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011080 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011081 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011082}
11083
Martin v. Löwis47383402007-08-15 07:32:56 +000011084int
11085PyUnicode_IsIdentifier(PyObject *self)
11086{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011087 int kind;
11088 void *data;
11089 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011090 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011091
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011092 if (PyUnicode_READY(self) == -1) {
11093 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011094 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011095 }
11096
11097 /* Special case for empty strings */
11098 if (PyUnicode_GET_LENGTH(self) == 0)
11099 return 0;
11100 kind = PyUnicode_KIND(self);
11101 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011102
11103 /* PEP 3131 says that the first character must be in
11104 XID_Start and subsequent characters in XID_Continue,
11105 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011106 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011107 letters, digits, underscore). However, given the current
11108 definition of XID_Start and XID_Continue, it is sufficient
11109 to check just for these, except that _ must be allowed
11110 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011111 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011112 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011113 return 0;
11114
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011115 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011116 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011117 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011118 return 1;
11119}
11120
11121PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011122 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011123\n\
11124Return True if S is a valid identifier according\n\
11125to the language definition.");
11126
11127static PyObject*
11128unicode_isidentifier(PyObject *self)
11129{
11130 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11131}
11132
Georg Brandl559e5d72008-06-11 18:37:52 +000011133PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011134 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011135\n\
11136Return True if all characters in S are considered\n\
11137printable in repr() or S is empty, False otherwise.");
11138
11139static PyObject*
11140unicode_isprintable(PyObject *self)
11141{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011142 Py_ssize_t i, length;
11143 int kind;
11144 void *data;
11145
11146 if (PyUnicode_READY(self) == -1)
11147 return NULL;
11148 length = PyUnicode_GET_LENGTH(self);
11149 kind = PyUnicode_KIND(self);
11150 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011151
11152 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011153 if (length == 1)
11154 return PyBool_FromLong(
11155 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011156
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011157 for (i = 0; i < length; i++) {
11158 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011159 Py_RETURN_FALSE;
11160 }
11161 }
11162 Py_RETURN_TRUE;
11163}
11164
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011165PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011166 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011167\n\
11168Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011169iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011170
11171static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011172unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011173{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011174 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011175}
11176
Martin v. Löwis18e16552006-02-15 17:27:45 +000011177static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000011178unicode_length(PyUnicodeObject *self)
11179{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011180 if (PyUnicode_READY(self) == -1)
11181 return -1;
11182 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011183}
11184
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011185PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011186 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011187\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011188Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011189done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011190
11191static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011192unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011193{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011194 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011195 Py_UCS4 fillchar = ' ';
11196
11197 if (PyUnicode_READY(self) == -1)
11198 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011199
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011200 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011201 return NULL;
11202
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011203 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011204 Py_INCREF(self);
11205 return (PyObject*) self;
11206 }
11207
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011208 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011209}
11210
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011211PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011212 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011213\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011214Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011215
11216static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011217unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011218{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011219 return fixup(self, fixlower);
11220}
11221
/* Strip modes passed as the `striptype` argument of the strip helpers.
   The values double as indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
/* Each entry is the PyArg_ParseTuple() format used by do_argstrip() for
   the corresponding strip mode. */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Skip the 3-character "|O:" prefix of the format string, leaving the
   bare method name (used in do_argstrip()'s error message). */
#define STRIPNAME(i) (stripformat[i]+3)
11230
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011231/* externally visible for str.strip(unicode) */
11232PyObject *
11233_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
11234{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011235 void *data;
11236 int kind;
11237 Py_ssize_t i, j, len;
11238 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011239
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011240 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11241 return NULL;
11242
11243 kind = PyUnicode_KIND(self);
11244 data = PyUnicode_DATA(self);
11245 len = PyUnicode_GET_LENGTH(self);
11246 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11247 PyUnicode_DATA(sepobj),
11248 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011249
Benjamin Peterson14339b62009-01-31 16:36:08 +000011250 i = 0;
11251 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011252 while (i < len &&
11253 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011254 i++;
11255 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011256 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011257
Benjamin Peterson14339b62009-01-31 16:36:08 +000011258 j = len;
11259 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011260 do {
11261 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011262 } while (j >= i &&
11263 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011264 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011265 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011266
Victor Stinner12bab6d2011-10-01 01:53:49 +020011267 return PyUnicode_Substring((PyObject*)self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011268}
11269
11270PyObject*
11271PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11272{
11273 unsigned char *data;
11274 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011275 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011276
Victor Stinnerde636f32011-10-01 03:55:54 +020011277 if (PyUnicode_READY(self) == -1)
11278 return NULL;
11279
11280 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11281
Victor Stinner12bab6d2011-10-01 01:53:49 +020011282 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011283 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011284 if (PyUnicode_CheckExact(self)) {
11285 Py_INCREF(self);
11286 return self;
11287 }
11288 else
11289 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011290 }
11291
Victor Stinner12bab6d2011-10-01 01:53:49 +020011292 length = end - start;
11293 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011294 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011295
Victor Stinnerde636f32011-10-01 03:55:54 +020011296 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011297 PyErr_SetString(PyExc_IndexError, "string index out of range");
11298 return NULL;
11299 }
11300
Victor Stinnerb9275c12011-10-05 14:01:42 +020011301 if (PyUnicode_IS_ASCII(self)) {
11302 kind = PyUnicode_KIND(self);
11303 data = PyUnicode_1BYTE_DATA(self);
11304 return unicode_fromascii(data + start, length);
11305 }
11306 else {
11307 kind = PyUnicode_KIND(self);
11308 data = PyUnicode_1BYTE_DATA(self);
11309 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011310 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011311 length);
11312 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011313}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011314
11315static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011316do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011317{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011318 int kind;
11319 void *data;
11320 Py_ssize_t len, i, j;
11321
11322 if (PyUnicode_READY(self) == -1)
11323 return NULL;
11324
11325 kind = PyUnicode_KIND(self);
11326 data = PyUnicode_DATA(self);
11327 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011328
Benjamin Peterson14339b62009-01-31 16:36:08 +000011329 i = 0;
11330 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011331 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011332 i++;
11333 }
11334 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011335
Benjamin Peterson14339b62009-01-31 16:36:08 +000011336 j = len;
11337 if (striptype != LEFTSTRIP) {
11338 do {
11339 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011340 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011341 j++;
11342 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011343
Victor Stinner12bab6d2011-10-01 01:53:49 +020011344 return PyUnicode_Substring((PyObject*)self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011345}
11346
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011347
11348static PyObject *
11349do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
11350{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011351 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011352
Benjamin Peterson14339b62009-01-31 16:36:08 +000011353 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11354 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011355
Benjamin Peterson14339b62009-01-31 16:36:08 +000011356 if (sep != NULL && sep != Py_None) {
11357 if (PyUnicode_Check(sep))
11358 return _PyUnicode_XStrip(self, striptype, sep);
11359 else {
11360 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011361 "%s arg must be None or str",
11362 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011363 return NULL;
11364 }
11365 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011366
Benjamin Peterson14339b62009-01-31 16:36:08 +000011367 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011368}
11369
11370
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011371PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011372 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011373\n\
11374Return a copy of the string S with leading and trailing\n\
11375whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011376If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011377
11378static PyObject *
11379unicode_strip(PyUnicodeObject *self, PyObject *args)
11380{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011381 if (PyTuple_GET_SIZE(args) == 0)
11382 return do_strip(self, BOTHSTRIP); /* Common case */
11383 else
11384 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011385}
11386
11387
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011388PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011389 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011390\n\
11391Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011392If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011393
11394static PyObject *
11395unicode_lstrip(PyUnicodeObject *self, PyObject *args)
11396{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011397 if (PyTuple_GET_SIZE(args) == 0)
11398 return do_strip(self, LEFTSTRIP); /* Common case */
11399 else
11400 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011401}
11402
11403
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011404PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011405 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011406\n\
11407Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011408If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011409
11410static PyObject *
11411unicode_rstrip(PyUnicodeObject *self, PyObject *args)
11412{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011413 if (PyTuple_GET_SIZE(args) == 0)
11414 return do_strip(self, RIGHTSTRIP); /* Common case */
11415 else
11416 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011417}
11418
11419
Guido van Rossumd57fd912000-03-10 22:53:23 +000011420static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000011421unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011422{
11423 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011424 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011425
Georg Brandl222de0f2009-04-12 12:01:50 +000011426 if (len < 1) {
11427 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011428 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011429 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011430
Tim Peters7a29bd52001-09-12 03:03:31 +000011431 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011432 /* no repeat, return original string */
11433 Py_INCREF(str);
11434 return (PyObject*) str;
11435 }
Tim Peters8f422462000-09-09 06:13:41 +000011436
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011437 if (PyUnicode_READY(str) == -1)
11438 return NULL;
11439
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011440 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011441 PyErr_SetString(PyExc_OverflowError,
11442 "repeated string is too long");
11443 return NULL;
11444 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011445 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011446
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011447 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011448 if (!u)
11449 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011450 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011451
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011452 if (PyUnicode_GET_LENGTH(str) == 1) {
11453 const int kind = PyUnicode_KIND(str);
11454 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11455 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011456 if (kind == PyUnicode_1BYTE_KIND)
11457 memset(to, (unsigned char)fill_char, len);
11458 else {
11459 for (n = 0; n < len; ++n)
11460 PyUnicode_WRITE(kind, to, n, fill_char);
11461 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011462 }
11463 else {
11464 /* number of characters copied this far */
11465 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011466 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011467 char *to = (char *) PyUnicode_DATA(u);
11468 Py_MEMCPY(to, PyUnicode_DATA(str),
11469 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011470 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011471 n = (done <= nchars-done) ? done : nchars-done;
11472 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011473 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011474 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011475 }
11476
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011477 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011478 return (PyObject*) u;
11479}
11480
Alexander Belopolsky40018472011-02-26 01:02:56 +000011481PyObject *
11482PyUnicode_Replace(PyObject *obj,
11483 PyObject *subobj,
11484 PyObject *replobj,
11485 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011486{
11487 PyObject *self;
11488 PyObject *str1;
11489 PyObject *str2;
11490 PyObject *result;
11491
11492 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011493 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011494 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011495 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011496 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011497 Py_DECREF(self);
11498 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011499 }
11500 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011501 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011502 Py_DECREF(self);
11503 Py_DECREF(str1);
11504 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011505 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011506 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011507 Py_DECREF(self);
11508 Py_DECREF(str1);
11509 Py_DECREF(str2);
11510 return result;
11511}
11512
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011513PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011514 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011515\n\
11516Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011517old replaced by new. If the optional argument count is\n\
11518given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011519
11520static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011521unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011522{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011523 PyObject *str1;
11524 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011525 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011526 PyObject *result;
11527
Martin v. Löwis18e16552006-02-15 17:27:45 +000011528 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011529 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011530 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011531 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011532 str1 = PyUnicode_FromObject(str1);
11533 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11534 return NULL;
11535 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011536 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011537 Py_DECREF(str1);
11538 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011539 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011540
11541 result = replace(self, str1, str2, maxcount);
11542
11543 Py_DECREF(str1);
11544 Py_DECREF(str2);
11545 return result;
11546}
11547
Alexander Belopolsky40018472011-02-26 01:02:56 +000011548static PyObject *
11549unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011550{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011551 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011552 Py_ssize_t isize;
11553 Py_ssize_t osize, squote, dquote, i, o;
11554 Py_UCS4 max, quote;
11555 int ikind, okind;
11556 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011557
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011558 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011559 return NULL;
11560
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011561 isize = PyUnicode_GET_LENGTH(unicode);
11562 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011563
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011564 /* Compute length of output, quote characters, and
11565 maximum character */
11566 osize = 2; /* quotes */
11567 max = 127;
11568 squote = dquote = 0;
11569 ikind = PyUnicode_KIND(unicode);
11570 for (i = 0; i < isize; i++) {
11571 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11572 switch (ch) {
11573 case '\'': squote++; osize++; break;
11574 case '"': dquote++; osize++; break;
11575 case '\\': case '\t': case '\r': case '\n':
11576 osize += 2; break;
11577 default:
11578 /* Fast-path ASCII */
11579 if (ch < ' ' || ch == 0x7f)
11580 osize += 4; /* \xHH */
11581 else if (ch < 0x7f)
11582 osize++;
11583 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11584 osize++;
11585 max = ch > max ? ch : max;
11586 }
11587 else if (ch < 0x100)
11588 osize += 4; /* \xHH */
11589 else if (ch < 0x10000)
11590 osize += 6; /* \uHHHH */
11591 else
11592 osize += 10; /* \uHHHHHHHH */
11593 }
11594 }
11595
11596 quote = '\'';
11597 if (squote) {
11598 if (dquote)
11599 /* Both squote and dquote present. Use squote,
11600 and escape them */
11601 osize += squote;
11602 else
11603 quote = '"';
11604 }
11605
11606 repr = PyUnicode_New(osize, max);
11607 if (repr == NULL)
11608 return NULL;
11609 okind = PyUnicode_KIND(repr);
11610 odata = PyUnicode_DATA(repr);
11611
11612 PyUnicode_WRITE(okind, odata, 0, quote);
11613 PyUnicode_WRITE(okind, odata, osize-1, quote);
11614
11615 for (i = 0, o = 1; i < isize; i++) {
11616 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011617
11618 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011619 if ((ch == quote) || (ch == '\\')) {
11620 PyUnicode_WRITE(okind, odata, o++, '\\');
11621 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011622 continue;
11623 }
11624
Benjamin Peterson29060642009-01-31 22:14:21 +000011625 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011626 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011627 PyUnicode_WRITE(okind, odata, o++, '\\');
11628 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011629 }
11630 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011631 PyUnicode_WRITE(okind, odata, o++, '\\');
11632 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011633 }
11634 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011635 PyUnicode_WRITE(okind, odata, o++, '\\');
11636 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011637 }
11638
11639 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011640 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011641 PyUnicode_WRITE(okind, odata, o++, '\\');
11642 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011643 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
11644 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011645 }
11646
Georg Brandl559e5d72008-06-11 18:37:52 +000011647 /* Copy ASCII characters as-is */
11648 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011649 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011650 }
11651
Benjamin Peterson29060642009-01-31 22:14:21 +000011652 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011653 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011654 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011655 (categories Z* and C* except ASCII space)
11656 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011657 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011658 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011659 if (ch <= 0xff) {
11660 PyUnicode_WRITE(okind, odata, o++, '\\');
11661 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011662 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
11663 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011664 }
11665 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011666 else if (ch >= 0x10000) {
11667 PyUnicode_WRITE(okind, odata, o++, '\\');
11668 PyUnicode_WRITE(okind, odata, o++, 'U');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011669 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
11670 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
11671 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
11672 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
11673 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
11674 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
11675 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
11676 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011677 }
11678 /* Map 16-bit characters to '\uxxxx' */
11679 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011680 PyUnicode_WRITE(okind, odata, o++, '\\');
11681 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011682 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
11683 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
11684 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
11685 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011686 }
11687 }
11688 /* Copy characters as-is */
11689 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011690 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011691 }
11692 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000011693 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011694 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020011695 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000011696 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011697}
11698
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011699PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011700 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011701\n\
11702Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011703such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011704arguments start and end are interpreted as in slice notation.\n\
11705\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011706Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011707
11708static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011709unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011710{
Jesus Ceaac451502011-04-20 17:09:23 +020011711 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011712 Py_ssize_t start;
11713 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011714 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011715
Jesus Ceaac451502011-04-20 17:09:23 +020011716 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11717 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011718 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011719
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011720 if (PyUnicode_READY(self) == -1)
11721 return NULL;
11722 if (PyUnicode_READY(substring) == -1)
11723 return NULL;
11724
Victor Stinner794d5672011-10-10 03:21:36 +020011725 result = any_find_slice(-1,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011726 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011727 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011728
11729 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011730
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011731 if (result == -2)
11732 return NULL;
11733
Christian Heimes217cfd12007-12-02 14:31:20 +000011734 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011735}
11736
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011737PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011738 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011739\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011740Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011741
11742static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011743unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011744{
Jesus Ceaac451502011-04-20 17:09:23 +020011745 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011746 Py_ssize_t start;
11747 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011748 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011749
Jesus Ceaac451502011-04-20 17:09:23 +020011750 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
11751 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011752 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011753
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011754 if (PyUnicode_READY(self) == -1)
11755 return NULL;
11756 if (PyUnicode_READY(substring) == -1)
11757 return NULL;
11758
Victor Stinner794d5672011-10-10 03:21:36 +020011759 result = any_find_slice(-1,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011760 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011761 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011762
11763 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011764
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011765 if (result == -2)
11766 return NULL;
11767
Guido van Rossumd57fd912000-03-10 22:53:23 +000011768 if (result < 0) {
11769 PyErr_SetString(PyExc_ValueError, "substring not found");
11770 return NULL;
11771 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011772
Christian Heimes217cfd12007-12-02 14:31:20 +000011773 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011774}
11775
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011776PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011777 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011778\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011779Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011780done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011781
11782static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011783unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011784{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011785 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011786 Py_UCS4 fillchar = ' ';
11787
Victor Stinnere9a29352011-10-01 02:14:59 +020011788 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011789 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011790
Victor Stinnere9a29352011-10-01 02:14:59 +020011791 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011792 return NULL;
11793
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011794 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011795 Py_INCREF(self);
11796 return (PyObject*) self;
11797 }
11798
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011799 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011800}
11801
Alexander Belopolsky40018472011-02-26 01:02:56 +000011802PyObject *
11803PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011804{
11805 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000011806
Guido van Rossumd57fd912000-03-10 22:53:23 +000011807 s = PyUnicode_FromObject(s);
11808 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011809 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011810 if (sep != NULL) {
11811 sep = PyUnicode_FromObject(sep);
11812 if (sep == NULL) {
11813 Py_DECREF(s);
11814 return NULL;
11815 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011816 }
11817
Victor Stinner9310abb2011-10-05 00:59:23 +020011818 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011819
11820 Py_DECREF(s);
11821 Py_XDECREF(sep);
11822 return result;
11823}
11824
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011825PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011826 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011827\n\
11828Return a list of the words in S, using sep as the\n\
11829delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000011830splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000011831whitespace string is a separator and empty strings are\n\
11832removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011833
11834static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011835unicode_split(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011836{
11837 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011838 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011839
Martin v. Löwis18e16552006-02-15 17:27:45 +000011840 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011841 return NULL;
11842
11843 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011844 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011845 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020011846 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011847 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011848 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011849}
11850
Thomas Wouters477c8d52006-05-27 19:21:47 +000011851PyObject *
11852PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11853{
11854 PyObject* str_obj;
11855 PyObject* sep_obj;
11856 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011857 int kind1, kind2, kind;
11858 void *buf1 = NULL, *buf2 = NULL;
11859 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011860
11861 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020011862 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011863 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011864 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011865 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011866 Py_DECREF(str_obj);
11867 return NULL;
11868 }
11869
Victor Stinner14f8f022011-10-05 20:58:25 +020011870 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011871 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020011872 kind = Py_MAX(kind1, kind2);
11873 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011874 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020011875 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011876 if (!buf1)
11877 goto onError;
11878 buf2 = PyUnicode_DATA(sep_obj);
11879 if (kind2 != kind)
11880 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11881 if (!buf2)
11882 goto onError;
11883 len1 = PyUnicode_GET_LENGTH(str_obj);
11884 len2 = PyUnicode_GET_LENGTH(sep_obj);
11885
Victor Stinner14f8f022011-10-05 20:58:25 +020011886 switch(PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011887 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020011888 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
11889 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11890 else
11891 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011892 break;
11893 case PyUnicode_2BYTE_KIND:
11894 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11895 break;
11896 case PyUnicode_4BYTE_KIND:
11897 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11898 break;
11899 default:
11900 assert(0);
11901 out = 0;
11902 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011903
11904 Py_DECREF(sep_obj);
11905 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011906 if (kind1 != kind)
11907 PyMem_Free(buf1);
11908 if (kind2 != kind)
11909 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011910
11911 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011912 onError:
11913 Py_DECREF(sep_obj);
11914 Py_DECREF(str_obj);
11915 if (kind1 != kind && buf1)
11916 PyMem_Free(buf1);
11917 if (kind2 != kind && buf2)
11918 PyMem_Free(buf2);
11919 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011920}
11921
11922
11923PyObject *
11924PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11925{
11926 PyObject* str_obj;
11927 PyObject* sep_obj;
11928 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011929 int kind1, kind2, kind;
11930 void *buf1 = NULL, *buf2 = NULL;
11931 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011932
11933 str_obj = PyUnicode_FromObject(str_in);
11934 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011935 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011936 sep_obj = PyUnicode_FromObject(sep_in);
11937 if (!sep_obj) {
11938 Py_DECREF(str_obj);
11939 return NULL;
11940 }
11941
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011942 kind1 = PyUnicode_KIND(str_in);
11943 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011944 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011945 buf1 = PyUnicode_DATA(str_in);
11946 if (kind1 != kind)
11947 buf1 = _PyUnicode_AsKind(str_in, kind);
11948 if (!buf1)
11949 goto onError;
11950 buf2 = PyUnicode_DATA(sep_obj);
11951 if (kind2 != kind)
11952 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11953 if (!buf2)
11954 goto onError;
11955 len1 = PyUnicode_GET_LENGTH(str_obj);
11956 len2 = PyUnicode_GET_LENGTH(sep_obj);
11957
11958 switch(PyUnicode_KIND(str_in)) {
11959 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020011960 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
11961 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11962 else
11963 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011964 break;
11965 case PyUnicode_2BYTE_KIND:
11966 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11967 break;
11968 case PyUnicode_4BYTE_KIND:
11969 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11970 break;
11971 default:
11972 assert(0);
11973 out = 0;
11974 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011975
11976 Py_DECREF(sep_obj);
11977 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011978 if (kind1 != kind)
11979 PyMem_Free(buf1);
11980 if (kind2 != kind)
11981 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011982
11983 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011984 onError:
11985 Py_DECREF(sep_obj);
11986 Py_DECREF(str_obj);
11987 if (kind1 != kind && buf1)
11988 PyMem_Free(buf1);
11989 if (kind2 != kind && buf2)
11990 PyMem_Free(buf2);
11991 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011992}
11993
11994PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011995 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011996\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011997Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011998the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011999found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012000
12001static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012002unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012003{
Victor Stinner9310abb2011-10-05 00:59:23 +020012004 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012005}
12006
12007PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012008 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012009\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012010Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012011the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012012separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012013
12014static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012015unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012016{
Victor Stinner9310abb2011-10-05 00:59:23 +020012017 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012018}
12019
Alexander Belopolsky40018472011-02-26 01:02:56 +000012020PyObject *
12021PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012022{
12023 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012024
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012025 s = PyUnicode_FromObject(s);
12026 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012027 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012028 if (sep != NULL) {
12029 sep = PyUnicode_FromObject(sep);
12030 if (sep == NULL) {
12031 Py_DECREF(s);
12032 return NULL;
12033 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012034 }
12035
Victor Stinner9310abb2011-10-05 00:59:23 +020012036 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012037
12038 Py_DECREF(s);
12039 Py_XDECREF(sep);
12040 return result;
12041}
12042
12043PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012044 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012045\n\
12046Return a list of the words in S, using sep as the\n\
12047delimiter string, starting at the end of the string and\n\
12048working to the front. If maxsplit is given, at most maxsplit\n\
12049splits are done. If sep is not specified, any whitespace string\n\
12050is a separator.");
12051
12052static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012053unicode_rsplit(PyObject *self, PyObject *args)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012054{
12055 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012056 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012057
Martin v. Löwis18e16552006-02-15 17:27:45 +000012058 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012059 return NULL;
12060
12061 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012062 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012063 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012064 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012065 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012066 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012067}
12068
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012069PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012070 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012071\n\
12072Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012073Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012074is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012075
12076static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012077unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012078{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012079 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012080 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012081
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012082 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12083 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012084 return NULL;
12085
Guido van Rossum86662912000-04-11 15:38:46 +000012086 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012087}
12088
12089static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012090PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012091{
Walter Dörwald346737f2007-05-31 10:44:43 +000012092 if (PyUnicode_CheckExact(self)) {
12093 Py_INCREF(self);
12094 return self;
12095 } else
12096 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020012097 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012098}
12099
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012100PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012101 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012102\n\
12103Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012104and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012105
12106static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012107unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012108{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012109 return fixup(self, fixswapcase);
12110}
12111
Georg Brandlceee0772007-11-27 23:48:05 +000012112PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012113 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012114\n\
12115Return a translation table usable for str.translate().\n\
12116If there is only one argument, it must be a dictionary mapping Unicode\n\
12117ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012118Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012119If there are two arguments, they must be strings of equal length, and\n\
12120in the resulting dictionary, each character in x will be mapped to the\n\
12121character at the same position in y. If there is a third argument, it\n\
12122must be a string, whose characters will be mapped to None in the result.");
12123
12124static PyObject*
12125unicode_maketrans(PyUnicodeObject *null, PyObject *args)
12126{
12127 PyObject *x, *y = NULL, *z = NULL;
12128 PyObject *new = NULL, *key, *value;
12129 Py_ssize_t i = 0;
12130 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012131
Georg Brandlceee0772007-11-27 23:48:05 +000012132 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12133 return NULL;
12134 new = PyDict_New();
12135 if (!new)
12136 return NULL;
12137 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012138 int x_kind, y_kind, z_kind;
12139 void *x_data, *y_data, *z_data;
12140
Georg Brandlceee0772007-11-27 23:48:05 +000012141 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012142 if (!PyUnicode_Check(x)) {
12143 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12144 "be a string if there is a second argument");
12145 goto err;
12146 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012147 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012148 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12149 "arguments must have equal length");
12150 goto err;
12151 }
12152 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012153 x_kind = PyUnicode_KIND(x);
12154 y_kind = PyUnicode_KIND(y);
12155 x_data = PyUnicode_DATA(x);
12156 y_data = PyUnicode_DATA(y);
12157 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12158 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12159 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012160 if (!key || !value)
12161 goto err;
12162 res = PyDict_SetItem(new, key, value);
12163 Py_DECREF(key);
12164 Py_DECREF(value);
12165 if (res < 0)
12166 goto err;
12167 }
12168 /* create entries for deleting chars in z */
12169 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012170 z_kind = PyUnicode_KIND(z);
12171 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012172 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012173 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012174 if (!key)
12175 goto err;
12176 res = PyDict_SetItem(new, key, Py_None);
12177 Py_DECREF(key);
12178 if (res < 0)
12179 goto err;
12180 }
12181 }
12182 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012183 int kind;
12184 void *data;
12185
Georg Brandlceee0772007-11-27 23:48:05 +000012186 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012187 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012188 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12189 "to maketrans it must be a dict");
12190 goto err;
12191 }
12192 /* copy entries into the new dict, converting string keys to int keys */
12193 while (PyDict_Next(x, &i, &key, &value)) {
12194 if (PyUnicode_Check(key)) {
12195 /* convert string keys to integer keys */
12196 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012197 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012198 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12199 "table must be of length 1");
12200 goto err;
12201 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012202 kind = PyUnicode_KIND(key);
12203 data = PyUnicode_DATA(key);
12204 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012205 if (!newkey)
12206 goto err;
12207 res = PyDict_SetItem(new, newkey, value);
12208 Py_DECREF(newkey);
12209 if (res < 0)
12210 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012211 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012212 /* just keep integer keys */
12213 if (PyDict_SetItem(new, key, value) < 0)
12214 goto err;
12215 } else {
12216 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12217 "be strings or integers");
12218 goto err;
12219 }
12220 }
12221 }
12222 return new;
12223 err:
12224 Py_DECREF(new);
12225 return NULL;
12226}
12227
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012228PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012229 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012230\n\
12231Return a copy of the string S, where all characters have been mapped\n\
12232through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012233Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012234Unmapped characters are left untouched. Characters mapped to None\n\
12235are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012236
12237static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012238unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012239{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012240 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012241}
12242
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012243PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012244 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012245\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012246Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012247
12248static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012249unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012250{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012251 return fixup(self, fixupper);
12252}
12253
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012254PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012255 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012256\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012257Pad a numeric string S with zeros on the left, to fill a field\n\
12258of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012259
12260static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012261unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012262{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012263 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012264 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012265 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012266 int kind;
12267 void *data;
12268 Py_UCS4 chr;
12269
12270 if (PyUnicode_READY(self) == -1)
12271 return NULL;
12272
Martin v. Löwis18e16552006-02-15 17:27:45 +000012273 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012274 return NULL;
12275
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012276 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012277 if (PyUnicode_CheckExact(self)) {
12278 Py_INCREF(self);
12279 return (PyObject*) self;
12280 }
12281 else
Victor Stinner2219e0a2011-10-01 01:16:59 +020012282 return PyUnicode_Copy((PyObject*)self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012283 }
12284
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012285 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012286
12287 u = pad(self, fill, 0, '0');
12288
Walter Dörwald068325e2002-04-15 13:36:47 +000012289 if (u == NULL)
12290 return NULL;
12291
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012292 kind = PyUnicode_KIND(u);
12293 data = PyUnicode_DATA(u);
12294 chr = PyUnicode_READ(kind, data, fill);
12295
12296 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012297 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012298 PyUnicode_WRITE(kind, data, 0, chr);
12299 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012300 }
12301
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012302 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012303 return (PyObject*) u;
12304}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012305
#if 0
/* Disabled by the preprocessor: thin wrapper around
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *ascii = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return ascii;
}
#endif
12313
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012314PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012315 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012316\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012317Return True if S starts with the specified prefix, False otherwise.\n\
12318With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012319With optional end, stop comparing S at that position.\n\
12320prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012321
12322static PyObject *
12323unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012324 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012325{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012326 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012327 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012328 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012329 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012330 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012331
Jesus Ceaac451502011-04-20 17:09:23 +020012332 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012333 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012334 if (PyTuple_Check(subobj)) {
12335 Py_ssize_t i;
12336 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12337 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012338 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012339 if (substring == NULL)
12340 return NULL;
12341 result = tailmatch(self, substring, start, end, -1);
12342 Py_DECREF(substring);
12343 if (result) {
12344 Py_RETURN_TRUE;
12345 }
12346 }
12347 /* nothing matched */
12348 Py_RETURN_FALSE;
12349 }
12350 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012351 if (substring == NULL) {
12352 if (PyErr_ExceptionMatches(PyExc_TypeError))
12353 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12354 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012355 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012356 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012357 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012358 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012359 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012360}
12361
12362
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012363PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012364 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012365\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012366Return True if S ends with the specified suffix, False otherwise.\n\
12367With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012368With optional end, stop comparing S at that position.\n\
12369suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012370
12371static PyObject *
12372unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012373 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012374{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012375 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012376 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012377 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012378 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012379 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012380
Jesus Ceaac451502011-04-20 17:09:23 +020012381 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012382 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012383 if (PyTuple_Check(subobj)) {
12384 Py_ssize_t i;
12385 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12386 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012387 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012388 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012389 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012390 result = tailmatch(self, substring, start, end, +1);
12391 Py_DECREF(substring);
12392 if (result) {
12393 Py_RETURN_TRUE;
12394 }
12395 }
12396 Py_RETURN_FALSE;
12397 }
12398 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012399 if (substring == NULL) {
12400 if (PyErr_ExceptionMatches(PyExc_TypeError))
12401 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12402 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012403 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012404 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012405 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012406 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012407 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012408}
12409
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012410#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012411
12412PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012413 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012414\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012415Return a formatted version of S, using substitutions from args and kwargs.\n\
12416The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012417
Eric Smith27bbca62010-11-04 17:06:58 +000012418PyDoc_STRVAR(format_map__doc__,
12419 "S.format_map(mapping) -> str\n\
12420\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012421Return a formatted version of S, using substitutions from mapping.\n\
12422The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012423
Eric Smith4a7d76d2008-05-30 18:10:19 +000012424static PyObject *
12425unicode__format__(PyObject* self, PyObject* args)
12426{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012427 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012428
12429 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12430 return NULL;
12431
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012432 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012433 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012434 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012435}
12436
Eric Smith8c663262007-08-25 02:26:07 +000012437PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012438 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012439\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012440Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012441
12442static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012443unicode__sizeof__(PyUnicodeObject *v)
12444{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012445 Py_ssize_t size;
12446
12447 /* If it's a compact object, account for base structure +
12448 character data. */
12449 if (PyUnicode_IS_COMPACT_ASCII(v))
12450 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12451 else if (PyUnicode_IS_COMPACT(v))
12452 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012453 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012454 else {
12455 /* If it is a two-block object, account for base object, and
12456 for character block if present. */
12457 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012458 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012459 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012460 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012461 }
12462 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012463 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012464 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012465 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012466 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012467 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012468
12469 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012470}
12471
12472PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012473 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012474
12475static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012476unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012477{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012478 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012479 if (!copy)
12480 return NULL;
12481 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012482}
12483
Guido van Rossumd57fd912000-03-10 22:53:23 +000012484static PyMethodDef unicode_methods[] = {
12485
12486 /* Order is according to common usage: often used methods should
12487 appear first, since lookup is done sequentially. */
12488
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012489 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012490 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12491 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012492 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012493 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12494 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12495 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12496 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12497 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12498 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12499 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012500 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012501 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12502 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12503 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012504 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012505 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12506 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12507 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012508 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012509 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012510 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012511 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012512 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12513 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12514 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12515 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12516 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12517 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12518 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12519 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12520 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12521 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12522 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12523 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12524 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12525 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012526 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012527 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012528 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012529 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012530 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012531 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012532 {"maketrans", (PyCFunction) unicode_maketrans,
12533 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012534 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012535#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012536 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012537#endif
12538
12539#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012540 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012541 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012542#endif
12543
Benjamin Peterson14339b62009-01-31 16:36:08 +000012544 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012545 {NULL, NULL}
12546};
12547
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012548static PyObject *
12549unicode_mod(PyObject *v, PyObject *w)
12550{
Brian Curtindfc80e32011-08-10 20:28:54 -050012551 if (!PyUnicode_Check(v))
12552 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012553 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012554}
12555
12556static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012557 0, /*nb_add*/
12558 0, /*nb_subtract*/
12559 0, /*nb_multiply*/
12560 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012561};
12562
Guido van Rossumd57fd912000-03-10 22:53:23 +000012563static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012564 (lenfunc) unicode_length, /* sq_length */
12565 PyUnicode_Concat, /* sq_concat */
12566 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12567 (ssizeargfunc) unicode_getitem, /* sq_item */
12568 0, /* sq_slice */
12569 0, /* sq_ass_item */
12570 0, /* sq_ass_slice */
12571 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012572};
12573
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012574static PyObject*
12575unicode_subscript(PyUnicodeObject* self, PyObject* item)
12576{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012577 if (PyUnicode_READY(self) == -1)
12578 return NULL;
12579
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000012580 if (PyIndex_Check(item)) {
12581 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012582 if (i == -1 && PyErr_Occurred())
12583 return NULL;
12584 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012585 i += PyUnicode_GET_LENGTH(self);
Victor Stinner2fe5ced2011-10-02 00:25:40 +020012586 return unicode_getitem((PyObject*)self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012587 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000012588 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012589 PyObject *result;
12590 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012591 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012592 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012593
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012594 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000012595 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012596 return NULL;
12597 }
12598
12599 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012600 return PyUnicode_New(0, 0);
12601 } else if (start == 0 && step == 1 &&
12602 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000012603 PyUnicode_CheckExact(self)) {
12604 Py_INCREF(self);
12605 return (PyObject *)self;
12606 } else if (step == 1) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012607 return PyUnicode_Substring((PyObject*)self,
12608 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012609 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012610 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012611 src_kind = PyUnicode_KIND(self);
12612 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020012613 if (!PyUnicode_IS_ASCII(self)) {
12614 kind_limit = kind_maxchar_limit(src_kind);
12615 max_char = 0;
12616 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12617 ch = PyUnicode_READ(src_kind, src_data, cur);
12618 if (ch > max_char) {
12619 max_char = ch;
12620 if (max_char >= kind_limit)
12621 break;
12622 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012623 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012624 }
Victor Stinner55c99112011-10-13 01:17:06 +020012625 else
12626 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012627 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012628 if (result == NULL)
12629 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012630 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012631 dest_data = PyUnicode_DATA(result);
12632
12633 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012634 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
12635 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012636 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012637 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012638 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012639 } else {
12640 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12641 return NULL;
12642 }
12643}
12644
12645static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012646 (lenfunc)unicode_length, /* mp_length */
12647 (binaryfunc)unicode_subscript, /* mp_subscript */
12648 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012649};
12650
Guido van Rossumd57fd912000-03-10 22:53:23 +000012651
Guido van Rossumd57fd912000-03-10 22:53:23 +000012652/* Helpers for PyUnicode_Format() */
12653
12654static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000012655getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012656{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012657 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012658 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012659 (*p_argidx)++;
12660 if (arglen < 0)
12661 return args;
12662 else
12663 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012664 }
12665 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012666 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012667 return NULL;
12668}
12669
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012670/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012671
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012672static PyObject *
12673formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012674{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012675 char *p;
12676 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012677 double x;
Tim Petersced69f82003-09-16 20:30:58 +000012678
Guido van Rossumd57fd912000-03-10 22:53:23 +000012679 x = PyFloat_AsDouble(v);
12680 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012681 return NULL;
12682
Guido van Rossumd57fd912000-03-10 22:53:23 +000012683 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012684 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000012685
Eric Smith0923d1d2009-04-16 20:16:10 +000012686 p = PyOS_double_to_string(x, type, prec,
12687 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012688 if (p == NULL)
12689 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012690 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000012691 PyMem_Free(p);
12692 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012693}
12694
Tim Peters38fd5b62000-09-21 05:43:11 +000012695static PyObject*
12696formatlong(PyObject *val, int flags, int prec, int type)
12697{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012698 char *buf;
12699 int len;
12700 PyObject *str; /* temporary string object. */
12701 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012702
Benjamin Peterson14339b62009-01-31 16:36:08 +000012703 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
12704 if (!str)
12705 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012706 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012707 Py_DECREF(str);
12708 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012709}
12710
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012711static Py_UCS4
12712formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012713{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012714 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012715 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012716 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012717 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000012718 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012719 goto onError;
12720 }
12721 else {
12722 /* Integer input truncated to a character */
12723 long x;
12724 x = PyLong_AsLong(v);
12725 if (x == -1 && PyErr_Occurred())
12726 goto onError;
12727
12728 if (x < 0 || x > 0x10ffff) {
12729 PyErr_SetString(PyExc_OverflowError,
12730 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012731 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012732 }
12733
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012734 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012735 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012736
Benjamin Peterson29060642009-01-31 22:14:21 +000012737 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012738 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012739 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012740 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012741}
12742
Antoine Pitrou978b9d22011-10-07 12:35:48 +020012743static int
12744repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
12745{
12746 int r;
12747 assert(count > 0);
12748 assert(PyUnicode_Check(obj));
12749 if (count > 5) {
12750 PyObject *repeated = unicode_repeat((PyUnicodeObject *) obj, count);
12751 if (repeated == NULL)
12752 return -1;
12753 r = _PyAccu_Accumulate(acc, repeated);
12754 Py_DECREF(repeated);
12755 return r;
12756 }
12757 else {
12758 do {
12759 if (_PyAccu_Accumulate(acc, obj))
12760 return -1;
12761 } while (--count);
12762 return 0;
12763 }
12764}
12765
Alexander Belopolsky40018472011-02-26 01:02:56 +000012766PyObject *
12767PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012768{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012769 void *fmt;
12770 int fmtkind;
12771 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012772 int kind;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012773 int r;
12774 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012775 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012776 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012777 PyObject *temp = NULL;
12778 PyObject *second = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012779 PyUnicodeObject *uformat;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012780 _PyAccu acc;
12781 static PyObject *plus, *minus, *blank, *zero, *percent;
12782
12783 if (!plus && !(plus = get_latin1_char('+')))
12784 return NULL;
12785 if (!minus && !(minus = get_latin1_char('-')))
12786 return NULL;
12787 if (!blank && !(blank = get_latin1_char(' ')))
12788 return NULL;
12789 if (!zero && !(zero = get_latin1_char('0')))
12790 return NULL;
12791 if (!percent && !(percent = get_latin1_char('%')))
12792 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000012793
Guido van Rossumd57fd912000-03-10 22:53:23 +000012794 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012795 PyErr_BadInternalCall();
12796 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012797 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012798 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
12799 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012800 return NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012801 if (_PyAccu_Init(&acc))
12802 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012803 fmt = PyUnicode_DATA(uformat);
12804 fmtkind = PyUnicode_KIND(uformat);
12805 fmtcnt = PyUnicode_GET_LENGTH(uformat);
12806 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012807
Guido van Rossumd57fd912000-03-10 22:53:23 +000012808 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012809 arglen = PyTuple_Size(args);
12810 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012811 }
12812 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012813 arglen = -1;
12814 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012815 }
Christian Heimes90aa7642007-12-19 02:45:37 +000012816 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000012817 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000012818 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012819
12820 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012821 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012822 PyObject *nonfmt;
12823 Py_ssize_t nonfmtpos;
12824 nonfmtpos = fmtpos++;
12825 while (fmtcnt >= 0 &&
12826 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
12827 fmtpos++;
12828 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012829 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012830 nonfmt = PyUnicode_Substring((PyObject *) uformat, nonfmtpos, fmtpos);
12831 if (nonfmt == NULL)
12832 goto onError;
12833 r = _PyAccu_Accumulate(&acc, nonfmt);
12834 Py_DECREF(nonfmt);
12835 if (r)
12836 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012837 }
12838 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012839 /* Got a format specifier */
12840 int flags = 0;
12841 Py_ssize_t width = -1;
12842 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012843 Py_UCS4 c = '\0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012844 Py_UCS4 fill, sign;
Benjamin Peterson29060642009-01-31 22:14:21 +000012845 int isnumok;
12846 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012847 void *pbuf = NULL;
12848 Py_ssize_t pindex, len;
12849 PyObject *signobj = NULL, *fillobj = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012850
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012851 fmtpos++;
12852 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
12853 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000012854 Py_ssize_t keylen;
12855 PyObject *key;
12856 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000012857
Benjamin Peterson29060642009-01-31 22:14:21 +000012858 if (dict == NULL) {
12859 PyErr_SetString(PyExc_TypeError,
12860 "format requires a mapping");
12861 goto onError;
12862 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012863 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012864 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012865 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012866 /* Skip over balanced parentheses */
12867 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012868 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012869 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012870 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012871 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012872 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012873 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012874 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012875 if (fmtcnt < 0 || pcount > 0) {
12876 PyErr_SetString(PyExc_ValueError,
12877 "incomplete format key");
12878 goto onError;
12879 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020012880 key = PyUnicode_Substring((PyObject*)uformat,
12881 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012882 if (key == NULL)
12883 goto onError;
12884 if (args_owned) {
12885 Py_DECREF(args);
12886 args_owned = 0;
12887 }
12888 args = PyObject_GetItem(dict, key);
12889 Py_DECREF(key);
12890 if (args == NULL) {
12891 goto onError;
12892 }
12893 args_owned = 1;
12894 arglen = -1;
12895 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012896 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012897 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012898 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012899 case '-': flags |= F_LJUST; continue;
12900 case '+': flags |= F_SIGN; continue;
12901 case ' ': flags |= F_BLANK; continue;
12902 case '#': flags |= F_ALT; continue;
12903 case '0': flags |= F_ZERO; continue;
12904 }
12905 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012906 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012907 if (c == '*') {
12908 v = getnextarg(args, arglen, &argidx);
12909 if (v == NULL)
12910 goto onError;
12911 if (!PyLong_Check(v)) {
12912 PyErr_SetString(PyExc_TypeError,
12913 "* wants int");
12914 goto onError;
12915 }
12916 width = PyLong_AsLong(v);
12917 if (width == -1 && PyErr_Occurred())
12918 goto onError;
12919 if (width < 0) {
12920 flags |= F_LJUST;
12921 width = -width;
12922 }
12923 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012924 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012925 }
12926 else if (c >= '0' && c <= '9') {
12927 width = c - '0';
12928 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012929 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012930 if (c < '0' || c > '9')
12931 break;
12932 if ((width*10) / 10 != width) {
12933 PyErr_SetString(PyExc_ValueError,
12934 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012935 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012936 }
12937 width = width*10 + (c - '0');
12938 }
12939 }
12940 if (c == '.') {
12941 prec = 0;
12942 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012943 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012944 if (c == '*') {
12945 v = getnextarg(args, arglen, &argidx);
12946 if (v == NULL)
12947 goto onError;
12948 if (!PyLong_Check(v)) {
12949 PyErr_SetString(PyExc_TypeError,
12950 "* wants int");
12951 goto onError;
12952 }
12953 prec = PyLong_AsLong(v);
12954 if (prec == -1 && PyErr_Occurred())
12955 goto onError;
12956 if (prec < 0)
12957 prec = 0;
12958 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012959 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012960 }
12961 else if (c >= '0' && c <= '9') {
12962 prec = c - '0';
12963 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012964 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012965 if (c < '0' || c > '9')
12966 break;
12967 if ((prec*10) / 10 != prec) {
12968 PyErr_SetString(PyExc_ValueError,
12969 "prec too big");
12970 goto onError;
12971 }
12972 prec = prec*10 + (c - '0');
12973 }
12974 }
12975 } /* prec */
12976 if (fmtcnt >= 0) {
12977 if (c == 'h' || c == 'l' || c == 'L') {
12978 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012979 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012980 }
12981 }
12982 if (fmtcnt < 0) {
12983 PyErr_SetString(PyExc_ValueError,
12984 "incomplete format");
12985 goto onError;
12986 }
12987 if (c != '%') {
12988 v = getnextarg(args, arglen, &argidx);
12989 if (v == NULL)
12990 goto onError;
12991 }
12992 sign = 0;
12993 fill = ' ';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012994 fillobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000012995 switch (c) {
12996
12997 case '%':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012998 _PyAccu_Accumulate(&acc, percent);
12999 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013000
13001 case 's':
13002 case 'r':
13003 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000013004 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013005 temp = v;
13006 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013007 }
13008 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013009 if (c == 's')
13010 temp = PyObject_Str(v);
13011 else if (c == 'r')
13012 temp = PyObject_Repr(v);
13013 else
13014 temp = PyObject_ASCII(v);
13015 if (temp == NULL)
13016 goto onError;
13017 if (PyUnicode_Check(temp))
13018 /* nothing to do */;
13019 else {
13020 Py_DECREF(temp);
13021 PyErr_SetString(PyExc_TypeError,
13022 "%s argument has non-string str()");
13023 goto onError;
13024 }
13025 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013026 if (PyUnicode_READY(temp) == -1) {
13027 Py_CLEAR(temp);
13028 goto onError;
13029 }
13030 pbuf = PyUnicode_DATA(temp);
13031 kind = PyUnicode_KIND(temp);
13032 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013033 if (prec >= 0 && len > prec)
13034 len = prec;
13035 break;
13036
13037 case 'i':
13038 case 'd':
13039 case 'u':
13040 case 'o':
13041 case 'x':
13042 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000013043 isnumok = 0;
13044 if (PyNumber_Check(v)) {
13045 PyObject *iobj=NULL;
13046
13047 if (PyLong_Check(v)) {
13048 iobj = v;
13049 Py_INCREF(iobj);
13050 }
13051 else {
13052 iobj = PyNumber_Long(v);
13053 }
13054 if (iobj!=NULL) {
13055 if (PyLong_Check(iobj)) {
13056 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013057 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013058 Py_DECREF(iobj);
13059 if (!temp)
13060 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013061 if (PyUnicode_READY(temp) == -1) {
13062 Py_CLEAR(temp);
13063 goto onError;
13064 }
13065 pbuf = PyUnicode_DATA(temp);
13066 kind = PyUnicode_KIND(temp);
13067 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013068 sign = 1;
13069 }
13070 else {
13071 Py_DECREF(iobj);
13072 }
13073 }
13074 }
13075 if (!isnumok) {
13076 PyErr_Format(PyExc_TypeError,
13077 "%%%c format: a number is required, "
13078 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13079 goto onError;
13080 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013081 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013082 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013083 fillobj = zero;
13084 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013085 break;
13086
13087 case 'e':
13088 case 'E':
13089 case 'f':
13090 case 'F':
13091 case 'g':
13092 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013093 temp = formatfloat(v, flags, prec, c);
13094 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000013095 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013096 if (PyUnicode_READY(temp) == -1) {
13097 Py_CLEAR(temp);
13098 goto onError;
13099 }
13100 pbuf = PyUnicode_DATA(temp);
13101 kind = PyUnicode_KIND(temp);
13102 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013103 sign = 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013104 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013105 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013106 fillobj = zero;
13107 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013108 break;
13109
13110 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013111 {
13112 Py_UCS4 ch = formatchar(v);
13113 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013114 goto onError;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013115 temp = _PyUnicode_FromUCS4(&ch, 1);
13116 if (temp == NULL)
13117 goto onError;
13118 pbuf = PyUnicode_DATA(temp);
13119 kind = PyUnicode_KIND(temp);
13120 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013121 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013122 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013123
13124 default:
13125 PyErr_Format(PyExc_ValueError,
13126 "unsupported format character '%c' (0x%x) "
13127 "at index %zd",
13128 (31<=c && c<=126) ? (char)c : '?',
13129 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013130 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013131 goto onError;
13132 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013133 /* pbuf is initialized here. */
13134 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013135 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013136 if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
13137 signobj = minus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013138 len--;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013139 pindex++;
13140 }
13141 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
13142 signobj = plus;
13143 len--;
13144 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013145 }
13146 else if (flags & F_SIGN)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013147 signobj = plus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013148 else if (flags & F_BLANK)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013149 signobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013150 else
13151 sign = 0;
13152 }
13153 if (width < len)
13154 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013155 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013156 if (fill != ' ') {
13157 assert(signobj != NULL);
13158 if (_PyAccu_Accumulate(&acc, signobj))
13159 goto onError;
13160 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013161 if (width > len)
13162 width--;
13163 }
13164 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013165 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013166 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013167 if (fill != ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013168 second = get_latin1_char(
13169 PyUnicode_READ(kind, pbuf, pindex + 1));
13170 pindex += 2;
13171 if (second == NULL ||
13172 _PyAccu_Accumulate(&acc, zero) ||
13173 _PyAccu_Accumulate(&acc, second))
13174 goto onError;
13175 Py_CLEAR(second);
Benjamin Peterson29060642009-01-31 22:14:21 +000013176 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013177 width -= 2;
13178 if (width < 0)
13179 width = 0;
13180 len -= 2;
13181 }
13182 if (width > len && !(flags & F_LJUST)) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013183 assert(fillobj != NULL);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013184 if (repeat_accumulate(&acc, fillobj, width - len))
13185 goto onError;
13186 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013187 }
13188 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013189 if (sign) {
13190 assert(signobj != NULL);
13191 if (_PyAccu_Accumulate(&acc, signobj))
13192 goto onError;
13193 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013194 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013195 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13196 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013197 second = get_latin1_char(
13198 PyUnicode_READ(kind, pbuf, pindex + 1));
13199 pindex += 2;
13200 if (second == NULL ||
13201 _PyAccu_Accumulate(&acc, zero) ||
13202 _PyAccu_Accumulate(&acc, second))
13203 goto onError;
13204 Py_CLEAR(second);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013205 }
13206 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013207 /* Copy all characters, preserving len */
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013208 if (temp != NULL) {
13209 assert(pbuf == PyUnicode_DATA(temp));
13210 v = PyUnicode_Substring(temp, pindex, pindex + len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013211 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013212 else {
13213 const char *p = (const char *) pbuf;
13214 assert(pbuf != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013215 p += kind * pindex;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013216 v = PyUnicode_FromKindAndData(kind, p, len);
13217 }
13218 if (v == NULL)
13219 goto onError;
13220 r = _PyAccu_Accumulate(&acc, v);
13221 Py_DECREF(v);
13222 if (r)
13223 goto onError;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013224 if (width > len && repeat_accumulate(&acc, blank, width - len))
13225 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013226 if (dict && (argidx < arglen) && c != '%') {
13227 PyErr_SetString(PyExc_TypeError,
13228 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013229 goto onError;
13230 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013231 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013232 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013233 } /* until end */
13234 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013235 PyErr_SetString(PyExc_TypeError,
13236 "not all arguments converted during string formatting");
13237 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013238 }
13239
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013240 result = _PyAccu_Finish(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013241 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013242 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013243 }
13244 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013245 Py_XDECREF(temp);
13246 Py_XDECREF(second);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013247 return (PyObject *)result;
13248
Benjamin Peterson29060642009-01-31 22:14:21 +000013249 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013250 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013251 Py_XDECREF(temp);
13252 Py_XDECREF(second);
13253 _PyAccu_Destroy(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013254 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013255 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013256 }
13257 return NULL;
13258}
13259
Jeremy Hylton938ace62002-07-17 16:30:39 +000013260static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013261unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13262
Tim Peters6d6c1a32001-08-02 04:15:00 +000013263static PyObject *
13264unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13265{
Benjamin Peterson29060642009-01-31 22:14:21 +000013266 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013267 static char *kwlist[] = {"object", "encoding", "errors", 0};
13268 char *encoding = NULL;
13269 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013270
Benjamin Peterson14339b62009-01-31 16:36:08 +000013271 if (type != &PyUnicode_Type)
13272 return unicode_subtype_new(type, args, kwds);
13273 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013274 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013275 return NULL;
13276 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013277 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013278 if (encoding == NULL && errors == NULL)
13279 return PyObject_Str(x);
13280 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013281 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013282}
13283
Guido van Rossume023fe02001-08-30 03:12:59 +000013284static PyObject *
13285unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13286{
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013287 PyUnicodeObject *unicode, *self;
13288 Py_ssize_t length, char_size;
13289 int share_wstr, share_utf8;
13290 unsigned int kind;
13291 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013292
Benjamin Peterson14339b62009-01-31 16:36:08 +000013293 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013294
13295 unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
13296 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013297 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013298 assert(_PyUnicode_CHECK(unicode));
Victor Stinnere06e1452011-10-04 20:52:31 +020013299 if (PyUnicode_READY(unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013300 return NULL;
13301
13302 self = (PyUnicodeObject *) type->tp_alloc(type, 0);
13303 if (self == NULL) {
13304 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013305 return NULL;
13306 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013307 kind = PyUnicode_KIND(unicode);
13308 length = PyUnicode_GET_LENGTH(unicode);
13309
13310 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013311#ifdef Py_DEBUG
13312 _PyUnicode_HASH(self) = -1;
13313#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013314 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013315#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013316 _PyUnicode_STATE(self).interned = 0;
13317 _PyUnicode_STATE(self).kind = kind;
13318 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013319 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013320 _PyUnicode_STATE(self).ready = 1;
13321 _PyUnicode_WSTR(self) = NULL;
13322 _PyUnicode_UTF8_LENGTH(self) = 0;
13323 _PyUnicode_UTF8(self) = NULL;
13324 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013325 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013326
13327 share_utf8 = 0;
13328 share_wstr = 0;
13329 if (kind == PyUnicode_1BYTE_KIND) {
13330 char_size = 1;
13331 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13332 share_utf8 = 1;
13333 }
13334 else if (kind == PyUnicode_2BYTE_KIND) {
13335 char_size = 2;
13336 if (sizeof(wchar_t) == 2)
13337 share_wstr = 1;
13338 }
13339 else {
13340 assert(kind == PyUnicode_4BYTE_KIND);
13341 char_size = 4;
13342 if (sizeof(wchar_t) == 4)
13343 share_wstr = 1;
13344 }
13345
13346 /* Ensure we won't overflow the length. */
13347 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13348 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013349 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013350 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013351 data = PyObject_MALLOC((length + 1) * char_size);
13352 if (data == NULL) {
13353 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013354 goto onError;
13355 }
13356
Victor Stinnerc3c74152011-10-02 20:39:55 +020013357 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013358 if (share_utf8) {
13359 _PyUnicode_UTF8_LENGTH(self) = length;
13360 _PyUnicode_UTF8(self) = data;
13361 }
13362 if (share_wstr) {
13363 _PyUnicode_WSTR_LENGTH(self) = length;
13364 _PyUnicode_WSTR(self) = (wchar_t *)data;
13365 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013366
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013367 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013368 kind * (length + 1));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013369 Py_DECREF(unicode);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013370 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013371#ifdef Py_DEBUG
13372 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13373#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013374 return (PyObject *)self;
13375
13376onError:
13377 Py_DECREF(unicode);
13378 Py_DECREF(self);
13379 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013380}
13381
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013382PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000013383 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000013384\n\
Collin Winterd474ce82007-08-07 19:42:11 +000013385Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000013386encoding defaults to the current default string encoding.\n\
13387errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013388
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013389static PyObject *unicode_iter(PyObject *seq);
13390
Guido van Rossumd57fd912000-03-10 22:53:23 +000013391PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000013392 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013393 "str", /* tp_name */
13394 sizeof(PyUnicodeObject), /* tp_size */
13395 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013396 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013397 (destructor)unicode_dealloc, /* tp_dealloc */
13398 0, /* tp_print */
13399 0, /* tp_getattr */
13400 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013401 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013402 unicode_repr, /* tp_repr */
13403 &unicode_as_number, /* tp_as_number */
13404 &unicode_as_sequence, /* tp_as_sequence */
13405 &unicode_as_mapping, /* tp_as_mapping */
13406 (hashfunc) unicode_hash, /* tp_hash*/
13407 0, /* tp_call*/
13408 (reprfunc) unicode_str, /* tp_str */
13409 PyObject_GenericGetAttr, /* tp_getattro */
13410 0, /* tp_setattro */
13411 0, /* tp_as_buffer */
13412 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000013413 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013414 unicode_doc, /* tp_doc */
13415 0, /* tp_traverse */
13416 0, /* tp_clear */
13417 PyUnicode_RichCompare, /* tp_richcompare */
13418 0, /* tp_weaklistoffset */
13419 unicode_iter, /* tp_iter */
13420 0, /* tp_iternext */
13421 unicode_methods, /* tp_methods */
13422 0, /* tp_members */
13423 0, /* tp_getset */
13424 &PyBaseObject_Type, /* tp_base */
13425 0, /* tp_dict */
13426 0, /* tp_descr_get */
13427 0, /* tp_descr_set */
13428 0, /* tp_dictoffset */
13429 0, /* tp_init */
13430 0, /* tp_alloc */
13431 unicode_new, /* tp_new */
13432 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013433};
13434
13435/* Initialize the Unicode implementation */
13436
Thomas Wouters78890102000-07-22 19:25:51 +000013437void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013438{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013439 int i;
13440
Thomas Wouters477c8d52006-05-27 19:21:47 +000013441 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013442 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013443 0x000A, /* LINE FEED */
13444 0x000D, /* CARRIAGE RETURN */
13445 0x001C, /* FILE SEPARATOR */
13446 0x001D, /* GROUP SEPARATOR */
13447 0x001E, /* RECORD SEPARATOR */
13448 0x0085, /* NEXT LINE */
13449 0x2028, /* LINE SEPARATOR */
13450 0x2029, /* PARAGRAPH SEPARATOR */
13451 };
13452
Fred Drakee4315f52000-05-09 19:53:39 +000013453 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013454 unicode_empty = PyUnicode_New(0, 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013455 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013456 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013457 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013458
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013459 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013460 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013461 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013462 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013463
13464 /* initialize the linebreak bloom filter */
13465 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013466 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013467 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013468
13469 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013470}
13471
13472/* Finalize the Unicode implementation */
13473
/* Free-list clearing entry point.  This implementation releases nothing
   and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
13479
Guido van Rossumd57fd912000-03-10 22:53:23 +000013480void
Thomas Wouters78890102000-07-22 19:25:51 +000013481_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013482{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013483 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013484
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013485 Py_XDECREF(unicode_empty);
13486 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013487
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013488 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013489 if (unicode_latin1[i]) {
13490 Py_DECREF(unicode_latin1[i]);
13491 unicode_latin1[i] = NULL;
13492 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013493 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020013494 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000013495 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013496}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013497
Walter Dörwald16807132007-05-25 13:52:07 +000013498void
13499PyUnicode_InternInPlace(PyObject **p)
13500{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013501 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
13502 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020013503#ifdef Py_DEBUG
13504 assert(s != NULL);
13505 assert(_PyUnicode_CHECK(s));
13506#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000013507 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020013508 return;
13509#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000013510 /* If it's a subclass, we don't really know what putting
13511 it in the interned dict might do. */
13512 if (!PyUnicode_CheckExact(s))
13513 return;
13514 if (PyUnicode_CHECK_INTERNED(s))
13515 return;
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013516 if (_PyUnicode_READY_REPLACE(p)) {
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013517 assert(0 && "_PyUnicode_READY_REPLACE fail in PyUnicode_InternInPlace");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013518 return;
13519 }
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013520 s = (PyUnicodeObject *)(*p);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013521 if (interned == NULL) {
13522 interned = PyDict_New();
13523 if (interned == NULL) {
13524 PyErr_Clear(); /* Don't leave an exception */
13525 return;
13526 }
13527 }
13528 /* It might be that the GetItem call fails even
13529 though the key is present in the dictionary,
13530 namely when this happens during a stack overflow. */
13531 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000013532 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013533 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000013534
Benjamin Peterson29060642009-01-31 22:14:21 +000013535 if (t) {
13536 Py_INCREF(t);
13537 Py_DECREF(*p);
13538 *p = t;
13539 return;
13540 }
Walter Dörwald16807132007-05-25 13:52:07 +000013541
Benjamin Peterson14339b62009-01-31 16:36:08 +000013542 PyThreadState_GET()->recursion_critical = 1;
13543 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
13544 PyErr_Clear();
13545 PyThreadState_GET()->recursion_critical = 0;
13546 return;
13547 }
13548 PyThreadState_GET()->recursion_critical = 0;
13549 /* The two references in interned are not counted by refcnt.
13550 The deallocator will take care of this */
13551 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013552 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000013553}
13554
13555void
13556PyUnicode_InternImmortal(PyObject **p)
13557{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013558 PyUnicodeObject *u = (PyUnicodeObject *)*p;
13559
Benjamin Peterson14339b62009-01-31 16:36:08 +000013560 PyUnicode_InternInPlace(p);
13561 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013562 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013563 Py_INCREF(*p);
13564 }
Walter Dörwald16807132007-05-25 13:52:07 +000013565}
13566
13567PyObject *
13568PyUnicode_InternFromString(const char *cp)
13569{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013570 PyObject *s = PyUnicode_FromString(cp);
13571 if (s == NULL)
13572 return NULL;
13573 PyUnicode_InternInPlace(&s);
13574 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000013575}
13576
Alexander Belopolsky40018472011-02-26 01:02:56 +000013577void
13578_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000013579{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013580 PyObject *keys;
13581 PyUnicodeObject *s;
13582 Py_ssize_t i, n;
13583 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000013584
Benjamin Peterson14339b62009-01-31 16:36:08 +000013585 if (interned == NULL || !PyDict_Check(interned))
13586 return;
13587 keys = PyDict_Keys(interned);
13588 if (keys == NULL || !PyList_Check(keys)) {
13589 PyErr_Clear();
13590 return;
13591 }
Walter Dörwald16807132007-05-25 13:52:07 +000013592
Benjamin Peterson14339b62009-01-31 16:36:08 +000013593 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13594 detector, interned unicode strings are not forcibly deallocated;
13595 rather, we give them their stolen references back, and then clear
13596 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000013597
Benjamin Peterson14339b62009-01-31 16:36:08 +000013598 n = PyList_GET_SIZE(keys);
13599 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000013600 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013601 for (i = 0; i < n; i++) {
13602 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013603 if (PyUnicode_READY(s) == -1) {
13604 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013605 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013606 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013607 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013608 case SSTATE_NOT_INTERNED:
13609 /* XXX Shouldn't happen */
13610 break;
13611 case SSTATE_INTERNED_IMMORTAL:
13612 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013613 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013614 break;
13615 case SSTATE_INTERNED_MORTAL:
13616 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013617 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013618 break;
13619 default:
13620 Py_FatalError("Inconsistent interned string state.");
13621 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013622 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013623 }
13624 fprintf(stderr, "total size of all interned strings: "
13625 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13626 "mortal/immortal\n", mortal_size, immortal_size);
13627 Py_DECREF(keys);
13628 PyDict_Clear(interned);
13629 Py_DECREF(interned);
13630 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000013631}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013632
13633
13634/********************* Unicode Iterator **************************/
13635
13636typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013637 PyObject_HEAD
13638 Py_ssize_t it_index;
13639 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013640} unicodeiterobject;
13641
13642static void
13643unicodeiter_dealloc(unicodeiterobject *it)
13644{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013645 _PyObject_GC_UNTRACK(it);
13646 Py_XDECREF(it->it_seq);
13647 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013648}
13649
13650static int
13651unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
13652{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013653 Py_VISIT(it->it_seq);
13654 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013655}
13656
13657static PyObject *
13658unicodeiter_next(unicodeiterobject *it)
13659{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013660 PyUnicodeObject *seq;
13661 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013662
Benjamin Peterson14339b62009-01-31 16:36:08 +000013663 assert(it != NULL);
13664 seq = it->it_seq;
13665 if (seq == NULL)
13666 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013667 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013668
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013669 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
13670 int kind = PyUnicode_KIND(seq);
13671 void *data = PyUnicode_DATA(seq);
13672 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
13673 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013674 if (item != NULL)
13675 ++it->it_index;
13676 return item;
13677 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013678
Benjamin Peterson14339b62009-01-31 16:36:08 +000013679 Py_DECREF(seq);
13680 it->it_seq = NULL;
13681 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013682}
13683
13684static PyObject *
13685unicodeiter_len(unicodeiterobject *it)
13686{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013687 Py_ssize_t len = 0;
13688 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013689 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013690 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013691}
13692
13693PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
13694
13695static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013696 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000013697 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000013698 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013699};
13700
13701PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013702 PyVarObject_HEAD_INIT(&PyType_Type, 0)
13703 "str_iterator", /* tp_name */
13704 sizeof(unicodeiterobject), /* tp_basicsize */
13705 0, /* tp_itemsize */
13706 /* methods */
13707 (destructor)unicodeiter_dealloc, /* tp_dealloc */
13708 0, /* tp_print */
13709 0, /* tp_getattr */
13710 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013711 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013712 0, /* tp_repr */
13713 0, /* tp_as_number */
13714 0, /* tp_as_sequence */
13715 0, /* tp_as_mapping */
13716 0, /* tp_hash */
13717 0, /* tp_call */
13718 0, /* tp_str */
13719 PyObject_GenericGetAttr, /* tp_getattro */
13720 0, /* tp_setattro */
13721 0, /* tp_as_buffer */
13722 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
13723 0, /* tp_doc */
13724 (traverseproc)unicodeiter_traverse, /* tp_traverse */
13725 0, /* tp_clear */
13726 0, /* tp_richcompare */
13727 0, /* tp_weaklistoffset */
13728 PyObject_SelfIter, /* tp_iter */
13729 (iternextfunc)unicodeiter_next, /* tp_iternext */
13730 unicodeiter_methods, /* tp_methods */
13731 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013732};
13733
13734static PyObject *
13735unicode_iter(PyObject *seq)
13736{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013737 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013738
Benjamin Peterson14339b62009-01-31 16:36:08 +000013739 if (!PyUnicode_Check(seq)) {
13740 PyErr_BadInternalCall();
13741 return NULL;
13742 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013743 if (PyUnicode_READY(seq) == -1)
13744 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013745 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
13746 if (it == NULL)
13747 return NULL;
13748 it->it_index = 0;
13749 Py_INCREF(seq);
13750 it->it_seq = (PyUnicodeObject *)seq;
13751 _PyObject_GC_TRACK(it);
13752 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013753}
13754
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013755#define UNIOP(x) Py_UNICODE_##x
13756#define UNIOP_t Py_UNICODE
13757#include "uniops.h"
13758#undef UNIOP
13759#undef UNIOP_t
13760#define UNIOP(x) Py_UCS4_##x
13761#define UNIOP_t Py_UCS4
13762#include "uniops.h"
13763#undef UNIOP
13764#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000013765
Victor Stinner71133ff2010-09-01 23:43:53 +000013766Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000013767PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000013768{
13769 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
Victor Stinner577db2c2011-10-11 22:12:48 +020013770 Py_UNICODE *u, *copy;
Victor Stinner71133ff2010-09-01 23:43:53 +000013771 Py_ssize_t size;
13772
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013773 if (!PyUnicode_Check(unicode)) {
13774 PyErr_BadArgument();
13775 return NULL;
13776 }
Victor Stinner577db2c2011-10-11 22:12:48 +020013777 u = PyUnicode_AsUnicode(object);
13778 if (u == NULL)
13779 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000013780 /* Ensure we won't overflow the size. */
13781 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
13782 PyErr_NoMemory();
13783 return NULL;
13784 }
13785 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
13786 size *= sizeof(Py_UNICODE);
13787 copy = PyMem_Malloc(size);
13788 if (copy == NULL) {
13789 PyErr_NoMemory();
13790 return NULL;
13791 }
Victor Stinner577db2c2011-10-11 22:12:48 +020013792 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000013793 return copy;
13794}
Martin v. Löwis5b222132007-06-10 09:51:05 +000013795
Georg Brandl66c221e2010-10-14 07:04:07 +000013796/* A _string module, to export formatter_parser and formatter_field_name_split
13797 to the string.Formatter class implemented in Python. */
13798
13799static PyMethodDef _string_methods[] = {
13800 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
13801 METH_O, PyDoc_STR("split the argument as a field name")},
13802 {"formatter_parser", (PyCFunction) formatter_parser,
13803 METH_O, PyDoc_STR("parse the argument as a format string")},
13804 {NULL, NULL}
13805};
13806
13807static struct PyModuleDef _string_module = {
13808 PyModuleDef_HEAD_INIT,
13809 "_string",
13810 PyDoc_STR("string helper module"),
13811 0,
13812 _string_methods,
13813 NULL,
13814 NULL,
13815 NULL,
13816 NULL
13817};
13818
13819PyMODINIT_FUNC
13820PyInit__string(void)
13821{
13822 return PyModule_Create(&_string_module);
13823}
13824
13825
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013826#ifdef __cplusplus
13827}
13828#endif