blob: b9611299423295804a84c5d59c4db0159b20b797 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
/* Debug builds define DONT_MAKE_RESULT_READY. */
#ifdef Py_DEBUG
#  define DONT_MAKE_RESULT_READY
#endif

/* Limit for the Unicode object free list */
#define PyUnicode_MAXFREELIST       1024

/* Limit for the Unicode object free list stay alive optimization.

   Objects on the free list whose size is below this limit keep their
   allocated Unicode memory intact, which reduces malloc() overhead for
   small Unicode objects.

   Worst case, this leaves

       PyUnicode_MAXFREELIST *
           (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT + malloc()-overhead)

   bytes of unused garbage around.

   A limit of 0 effectively turns the feature off.

   Note: This is an experimental feature! If you get core dumps when
   using Unicode objects, turn this feature off.
*/
#define KEEPALIVE_SIZE_LIMIT       9

/* Endianness switches; little endian unless WORDS_BIGENDIAN is defined */
#ifndef WORDS_BIGENDIAN
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
/* Sanity check used by the checked accessors below: the full consistency
   check (without content verification) in debug builds, a plain type
   check otherwise. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif

/* --- Raw field accessors: no assertions, usable as lvalues --- */

#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)

/* --- Checked accessors: assert on the object before reading --- */

/* UTF-8 buffer of a ready string. For a compact ASCII string this is the
   memory located right after the PyASCIIObject header; otherwise it is
   the utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))

/* Length matching PyUnicode_UTF8(): the string length for compact ASCII,
   the utf8_length field otherwise. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200136
/* Local override of PyUnicode_READY() that uses the file-local sanity
   check. Evaluates to 0 when the string is already ready, otherwise to
   the result of _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready((PyObject *)(op))))

/* Same as PyUnicode_READY(), but takes a pointer to the object pointer and
   delegates to _PyUnicode_ReadyReplace() when the string is not ready.
   The argument is parenthesized before being dereferenced so that an
   expression such as `items + 1` dereferences the intended slot. */
#define _PyUnicode_READY_REPLACE(p_obj)                 \
    (assert(_PyUnicode_CHECK(*(p_obj))),                \
     (PyUnicode_IS_READY(*(p_obj)) ?                    \
      0 : _PyUnicode_ReadyReplace((PyObject **)(p_obj))))
148
/* True if the utf8 pointer of a (non compact-ASCII) string is the same
   pointer as its character data. */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer is the same pointer as the character data.
   Only the macro argument is referenced, so the macro works whatever the
   caller's variable is called. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
156
/* True if the Unicode object owns a separate UTF-8 memory block, i.e. it
   is not compact ASCII, its utf8 pointer is set, and that pointer is not
   the character data itself. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     !(PyUnicode_IS_COMPACT_ASCII(op)                   \
       || !_PyUnicode_UTF8(op)                          \
       || _PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the Unicode object owns a separate wstr memory block, i.e. its
   wstr pointer is set and the string is either not ready or its wstr
   pointer differs from the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) &&                            \
      !(PyUnicode_IS_READY(op) &&                       \
        _PyUnicode_WSTR(op) == PyUnicode_DATA(op))))
172
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   The copy is unrolled four elements at a time; the remaining 0-3
   elements are copied one by one.  Every argument is parenthesized before
   use and all temporaries carry a leading underscore, so arbitrary pointer
   expressions can be passed for begin, end and to. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = _end - _iter;                   \
        /* round the element count down to a multiple of 4 */ \
        const from_type *_unrolled_end =                \
            _iter + (_n & ~(Py_ssize_t)3);              \
        while (_iter < _unrolled_end) {                 \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < _end)                            \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200196
Victor Stinnerb15d4d82011-09-28 23:59:20 +0200197/* The Unicode string has been modified: reset the hash */
198#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
199
Walter Dörwald16807132007-05-25 13:52:07 +0000200/* This dictionary holds all interned unicode strings. Note that references
201 to strings in this dictionary are *not* counted in the string's ob_refcnt.
202 When the interned string reaches a refcnt of 0 the string deallocation
203 function will delete the reference from this dictionary.
204
205 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000206 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000207*/
208static PyObject *interned;
209
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000210/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200211static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000212
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200213/* List of static strings. */
214static _Py_Identifier *static_strings;
215
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000216/* Single character Unicode strings in the Latin-1 range are being
217 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200218static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000219
/* Fast detection of the most frequent whitespace characters.

   128-entry lookup table indexed by an ASCII code; an entry is 1 for:
     0x09 CHARACTER TABULATION   0x0A LINE FEED        0x0B LINE TABULATION
     0x0C FORM FEED              0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR         0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR       0x1F UNIT SEPARATOR
     0x20 SPACE
   and 0 for everything else. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F */
    1, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0
};
250
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200251/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200252static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200253static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200254static void copy_characters(
255 PyObject *to, Py_ssize_t to_start,
256 PyObject *from, Py_ssize_t from_start,
257 Py_ssize_t how_many);
Victor Stinnerc729b8e2011-10-06 02:36:59 +0200258#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200259static int unicode_is_singleton(PyObject *unicode);
Victor Stinnerc729b8e2011-10-06 02:36:59 +0200260#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200261
Alexander Belopolsky40018472011-02-26 01:02:56 +0000262static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200263unicode_fromascii(const unsigned char *s, Py_ssize_t size);
264static PyObject *
265_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
266static PyObject *
267_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
268static PyObject *
269_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
270
271static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000272unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000273 PyObject **errorHandler,const char *encoding, const char *reason,
274 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
275 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
276
Alexander Belopolsky40018472011-02-26 01:02:56 +0000277static void
278raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300279 const char *encoding,
280 const Py_UNICODE *unicode, Py_ssize_t size,
281 Py_ssize_t startpos, Py_ssize_t endpos,
282 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000283
/* Same for linebreaks.

   128-entry lookup table indexed by an ASCII code; an entry is 1 for:
     0x0A LINE FEED          0x0B LINE TABULATION
     0x0C FORM FEED          0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR     0x1D GROUP SEPARATOR    0x1E RECORD SEPARATOR
   and 0 for everything else. */
static unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0
};
311
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300312/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
313 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000314Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000315PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000316{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000317#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000318 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000319#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000320 /* This is actually an illegal character, so it should
321 not be passed to unichr. */
322 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000323#endif
324}
325
#ifdef Py_DEBUG
/* Debug-build checker for the internal invariants of a Unicode object.

   op is the object to inspect.  Every invariant is verified with assert(),
   so a violation aborts instead of being reported; the function returns 1
   whenever it returns at all, which lets callers wrap it in assert().

   With a non-zero check_content the characters are scanned as well, to
   verify that the narrowest suitable kind is used, and the cached hash is
   required to be -1 unless the object is one of the singletons. */
int
/* FIXME: use PyObject* type for op */
_PyUnicode_CheckConsistency(void *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;
        int wchar_sized;

        if (ascii->state.compact == 1) {
            /* compact non-ASCII string: characters follow the header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready yet: only the wstr representation exists */
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
            }
            else {
                /* ready, non-compact string */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
        }

        /* does the kind have the same width as wchar_t? */
#if SIZEOF_WCHAR_T == 2
        wchar_sized = (kind == PyUnicode_2BYTE_KIND);
#else
        wchar_sized = (kind == PyUnicode_4BYTE_KIND);
#endif
        if (kind != PyUnicode_WCHAR_KIND) {
            if (wchar_sized) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else
                assert(ascii->wstr != data);
        }

        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t pos;
        Py_UCS4 highest = 0;
        void *chars = PyUnicode_DATA(ascii);

        for (pos = 0; pos < ascii->length; pos++) {
            Py_UCS4 ch = PyUnicode_READ(kind, chars, pos);
            if (ch > highest)
                highest = ch;
        }
        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (ascii->state.ascii == 0)
                assert(highest >= 128);
            else
                assert(highest < 128);
            break;
        case PyUnicode_2BYTE_KIND:
            assert(highest >= 0x100);
            break;
        default:
            assert(highest >= 0x10000);
            break;
        }
    }
    if (check_content && !unicode_is_singleton((PyObject*)ascii))
        assert(ascii->hash == -1);
    return 1;
}
#endif
431
Victor Stinner3a50e702011-10-18 21:21:00 +0200432#ifdef HAVE_MBCS
433static OSVERSIONINFOEX winver;
434#endif
435
Thomas Wouters477c8d52006-05-27 19:21:47 +0000436/* --- Bloom Filters ----------------------------------------------------- */
437
438/* stuff to implement simple "bloom filters" for Unicode characters.
439 to keep things simple, we use a single bitmask, using the least 5
440 bits from each unicode characters as the bit index. */
441
442/* the linebreak mask is set up by Unicode_Init below */
443
Antoine Pitrouf068f942010-01-13 14:19:12 +0000444#if LONG_BIT >= 128
445#define BLOOM_WIDTH 128
446#elif LONG_BIT >= 64
447#define BLOOM_WIDTH 64
448#elif LONG_BIT >= 32
449#define BLOOM_WIDTH 32
450#else
451#error "LONG_BIT is smaller than 32"
452#endif
453
Thomas Wouters477c8d52006-05-27 19:21:47 +0000454#define BLOOM_MASK unsigned long
455
456static BLOOM_MASK bloom_linebreak;
457
/* BLOOM_ADD sets, in the lvalue mask, the bit selected by the low bits of
   ch.  BLOOM evaluates to non-zero if that bit is set in mask; mask may be
   any expression (it is parenthesized before use).  A zero result means ch
   was never added; a non-zero result only means it may have been. */
#define BLOOM_ADD(mask, ch) (((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
#define BLOOM(mask, ch)     (((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Linebreak test: a direct ascii_linebreak table lookup below 128;
   otherwise the bloom_linebreak mask is consulted first and
   Py_UNICODE_ISLINEBREAK() is only called when the mask bit is set. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Alexander Belopolsky40018472011-02-26 01:02:56 +0000465Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200466make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000467{
468 /* calculate simple bloom-style bitmask for a given unicode string */
469
Antoine Pitrouf068f942010-01-13 14:19:12 +0000470 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000471 Py_ssize_t i;
472
473 mask = 0;
474 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200475 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000476
477 return mask;
478}
479
/* True if chr occurs in str: the bloom mask is tested first, and
   PyUnicode_FindChar() is only called (forward search over the whole
   string) when the mask bit is set.  A negative search result counts as
   "not a member". */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr)                                                   \
     && !(PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) < 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000483
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200484/* Compilation of templated routines */
485
486#include "stringlib/asciilib.h"
487#include "stringlib/fastsearch.h"
488#include "stringlib/partition.h"
489#include "stringlib/split.h"
490#include "stringlib/count.h"
491#include "stringlib/find.h"
492#include "stringlib/find_max_char.h"
493#include "stringlib/localeutil.h"
494#include "stringlib/undef.h"
495
496#include "stringlib/ucs1lib.h"
497#include "stringlib/fastsearch.h"
498#include "stringlib/partition.h"
499#include "stringlib/split.h"
500#include "stringlib/count.h"
501#include "stringlib/find.h"
502#include "stringlib/find_max_char.h"
503#include "stringlib/localeutil.h"
504#include "stringlib/undef.h"
505
506#include "stringlib/ucs2lib.h"
507#include "stringlib/fastsearch.h"
508#include "stringlib/partition.h"
509#include "stringlib/split.h"
510#include "stringlib/count.h"
511#include "stringlib/find.h"
512#include "stringlib/find_max_char.h"
513#include "stringlib/localeutil.h"
514#include "stringlib/undef.h"
515
516#include "stringlib/ucs4lib.h"
517#include "stringlib/fastsearch.h"
518#include "stringlib/partition.h"
519#include "stringlib/split.h"
520#include "stringlib/count.h"
521#include "stringlib/find.h"
522#include "stringlib/find_max_char.h"
523#include "stringlib/localeutil.h"
524#include "stringlib/undef.h"
525
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200526#include "stringlib/unicodedefs.h"
527#include "stringlib/fastsearch.h"
528#include "stringlib/count.h"
529#include "stringlib/find.h"
530
Guido van Rossumd57fd912000-03-10 22:53:23 +0000531/* --- Unicode Object ----------------------------------------------------- */
532
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200533static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200534fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200535
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200536Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
537 Py_ssize_t size, Py_UCS4 ch,
538 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200539{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200540 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
541
542 switch (kind) {
543 case PyUnicode_1BYTE_KIND:
544 {
545 Py_UCS1 ch1 = (Py_UCS1) ch;
546 if (ch1 == ch)
547 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
548 else
549 return -1;
550 }
551 case PyUnicode_2BYTE_KIND:
552 {
553 Py_UCS2 ch2 = (Py_UCS2) ch;
554 if (ch2 == ch)
555 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
556 else
557 return -1;
558 }
559 case PyUnicode_4BYTE_KIND:
560 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
561 default:
562 assert(0);
563 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200564 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200565}
566
Victor Stinnerfe226c02011-10-03 03:52:20 +0200567static PyObject*
568resize_compact(PyObject *unicode, Py_ssize_t length)
569{
570 Py_ssize_t char_size;
571 Py_ssize_t struct_size;
572 Py_ssize_t new_size;
573 int share_wstr;
574
575 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200576 char_size = PyUnicode_KIND(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200577 if (PyUnicode_IS_COMPACT_ASCII(unicode))
578 struct_size = sizeof(PyASCIIObject);
579 else
580 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200581 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200582
583 _Py_DEC_REFTOTAL;
584 _Py_ForgetReference(unicode);
585
586 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
587 PyErr_NoMemory();
588 return NULL;
589 }
590 new_size = (struct_size + (length + 1) * char_size);
591
592 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
593 if (unicode == NULL) {
594 PyObject_Del(unicode);
595 PyErr_NoMemory();
596 return NULL;
597 }
598 _Py_NewReference(unicode);
599 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200600 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200601 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200602 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
603 _PyUnicode_WSTR_LENGTH(unicode) = length;
604 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200605 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
606 length, 0);
607 return unicode;
608}
609
Alexander Belopolsky40018472011-02-26 01:02:56 +0000610static int
Victor Stinner95663112011-10-04 01:03:50 +0200611resize_inplace(PyUnicodeObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000612{
Victor Stinner95663112011-10-04 01:03:50 +0200613 wchar_t *wstr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200614 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200615 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000616
Victor Stinner95663112011-10-04 01:03:50 +0200617 _PyUnicode_DIRTY(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200618
619 if (PyUnicode_IS_READY(unicode)) {
620 Py_ssize_t char_size;
621 Py_ssize_t new_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200622 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200623 void *data;
624
625 data = _PyUnicode_DATA_ANY(unicode);
626 assert(data != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200627 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200628 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
629 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinner95663112011-10-04 01:03:50 +0200630 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
631 {
632 PyObject_DEL(_PyUnicode_UTF8(unicode));
633 _PyUnicode_UTF8(unicode) = NULL;
634 _PyUnicode_UTF8_LENGTH(unicode) = 0;
635 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200636
637 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
638 PyErr_NoMemory();
639 return -1;
640 }
641 new_size = (length + 1) * char_size;
642
643 data = (PyObject *)PyObject_REALLOC(data, new_size);
644 if (data == NULL) {
645 PyErr_NoMemory();
646 return -1;
647 }
648 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200649 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200650 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200651 _PyUnicode_WSTR_LENGTH(unicode) = length;
652 }
653 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200654 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200655 _PyUnicode_UTF8_LENGTH(unicode) = length;
656 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200657 _PyUnicode_LENGTH(unicode) = length;
658 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200659 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200660 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200661 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200662 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200663 }
Victor Stinner95663112011-10-04 01:03:50 +0200664 assert(_PyUnicode_WSTR(unicode) != NULL);
665
666 /* check for integer overflow */
667 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
668 PyErr_NoMemory();
669 return -1;
670 }
671 wstr = _PyUnicode_WSTR(unicode);
672 wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
673 if (!wstr) {
674 PyErr_NoMemory();
675 return -1;
676 }
677 _PyUnicode_WSTR(unicode) = wstr;
678 _PyUnicode_WSTR(unicode)[length] = 0;
679 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200680 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000681 return 0;
682}
683
Victor Stinnerfe226c02011-10-03 03:52:20 +0200684static PyObject*
685resize_copy(PyObject *unicode, Py_ssize_t length)
686{
687 Py_ssize_t copy_length;
688 if (PyUnicode_IS_COMPACT(unicode)) {
689 PyObject *copy;
690 assert(PyUnicode_IS_READY(unicode));
691
692 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
693 if (copy == NULL)
694 return NULL;
695
696 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200697 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200698 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200699 }
700 else {
Victor Stinner2fd82272011-10-03 04:06:05 +0200701 PyUnicodeObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200702 assert(_PyUnicode_WSTR(unicode) != NULL);
703 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner2fd82272011-10-03 04:06:05 +0200704 w = _PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200705 if (w == NULL)
706 return NULL;
707 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
708 copy_length = Py_MIN(copy_length, length);
709 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
710 copy_length);
711 return (PyObject*)w;
712 }
713}
714
/* We allocate one extra character (not just one byte) so that the string
   is always U+0000 terminated; some code (e.g. new_identifier) relies on
   that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
723
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200724#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200725static int unicode_old_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200726#endif
727
Alexander Belopolsky40018472011-02-26 01:02:56 +0000728static PyUnicodeObject *
729_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000730{
731 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200732 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000733
Thomas Wouters477c8d52006-05-27 19:21:47 +0000734 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000735 if (length == 0 && unicode_empty != NULL) {
736 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200737 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000738 }
739
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000740 /* Ensure we won't overflow the size. */
741 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
742 return (PyUnicodeObject *)PyErr_NoMemory();
743 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200744 if (length < 0) {
745 PyErr_SetString(PyExc_SystemError,
746 "Negative size passed to _PyUnicode_New");
747 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000748 }
749
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200750#ifdef Py_DEBUG
751 ++unicode_old_new_calls;
752#endif
753
754 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
755 if (unicode == NULL)
756 return NULL;
757 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
758 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
759 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000760 PyErr_NoMemory();
761 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000762 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200763
Jeremy Hyltond8082792003-09-16 19:41:39 +0000764 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000765 * the caller fails before initializing str -- unicode_resize()
766 * reads str[0], and the Keep-Alive optimization can keep memory
767 * allocated for str alive across a call to unicode_dealloc(unicode).
768 * We don't want unicode_resize to read uninitialized memory in
769 * that case.
770 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200771 _PyUnicode_WSTR(unicode)[0] = 0;
772 _PyUnicode_WSTR(unicode)[length] = 0;
773 _PyUnicode_WSTR_LENGTH(unicode) = length;
774 _PyUnicode_HASH(unicode) = -1;
775 _PyUnicode_STATE(unicode).interned = 0;
776 _PyUnicode_STATE(unicode).kind = 0;
777 _PyUnicode_STATE(unicode).compact = 0;
778 _PyUnicode_STATE(unicode).ready = 0;
779 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200780 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200781 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200782 _PyUnicode_UTF8(unicode) = NULL;
783 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner67072932011-10-18 22:10:14 +0200784 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000785 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000786
Benjamin Peterson29060642009-01-31 22:14:21 +0000787 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000788 /* XXX UNREF/NEWREF interface should be more symmetrical */
789 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000790 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000791 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000792 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000793}
794
Victor Stinnerf42dc442011-10-02 23:33:16 +0200795static const char*
796unicode_kind_name(PyObject *unicode)
797{
Victor Stinner42dfd712011-10-03 14:41:45 +0200798 /* don't check consistency: unicode_kind_name() is called from
799 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200800 if (!PyUnicode_IS_COMPACT(unicode))
801 {
802 if (!PyUnicode_IS_READY(unicode))
803 return "wstr";
804 switch(PyUnicode_KIND(unicode))
805 {
806 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200807 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200808 return "legacy ascii";
809 else
810 return "legacy latin1";
811 case PyUnicode_2BYTE_KIND:
812 return "legacy UCS2";
813 case PyUnicode_4BYTE_KIND:
814 return "legacy UCS4";
815 default:
816 return "<legacy invalid kind>";
817 }
818 }
819 assert(PyUnicode_IS_READY(unicode));
820 switch(PyUnicode_KIND(unicode))
821 {
822 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200823 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200824 return "ascii";
825 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200826 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200827 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200828 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200829 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200830 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200831 default:
832 return "<invalid compact kind>";
833 }
834}
835
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200836#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200837static int unicode_new_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200838
839/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, so it can be called from a
   debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
843
/* Function form of the _PyUnicode_COMPACT_DATA() macro, so it can be called
   from a debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
847void *_PyUnicode_data(void *unicode){
848 printf("obj %p\n", unicode);
849 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
850 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
851 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
852 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
853 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
854 return PyUnicode_DATA(unicode);
855}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200856
857void
858_PyUnicode_Dump(PyObject *op)
859{
860 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200861 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
862 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
863 void *data;
864 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
865 if (ascii->state.compact)
866 data = (compact + 1);
867 else
868 data = unicode->data.any;
869 if (ascii->wstr == data)
870 printf("shared ");
871 printf("wstr=%p", ascii->wstr);
Victor Stinnera3b334d2011-10-03 13:53:37 +0200872 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200873 printf(" (%zu), ", compact->wstr_length);
874 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
875 printf("shared ");
876 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200877 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200878 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200879}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200880#endif
881
882PyObject *
883PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
884{
885 PyObject *obj;
886 PyCompactUnicodeObject *unicode;
887 void *data;
888 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200889 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200890 Py_ssize_t char_size;
891 Py_ssize_t struct_size;
892
893 /* Optimization for empty strings */
894 if (size == 0 && unicode_empty != NULL) {
895 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200896 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200897 }
898
899#ifdef Py_DEBUG
900 ++unicode_new_new_calls;
901#endif
902
Victor Stinner9e9d6892011-10-04 01:02:02 +0200903 is_ascii = 0;
904 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200905 struct_size = sizeof(PyCompactUnicodeObject);
906 if (maxchar < 128) {
907 kind_state = PyUnicode_1BYTE_KIND;
908 char_size = 1;
909 is_ascii = 1;
910 struct_size = sizeof(PyASCIIObject);
911 }
912 else if (maxchar < 256) {
913 kind_state = PyUnicode_1BYTE_KIND;
914 char_size = 1;
915 }
916 else if (maxchar < 65536) {
917 kind_state = PyUnicode_2BYTE_KIND;
918 char_size = 2;
919 if (sizeof(wchar_t) == 2)
920 is_sharing = 1;
921 }
922 else {
923 kind_state = PyUnicode_4BYTE_KIND;
924 char_size = 4;
925 if (sizeof(wchar_t) == 4)
926 is_sharing = 1;
927 }
928
929 /* Ensure we won't overflow the size. */
930 if (size < 0) {
931 PyErr_SetString(PyExc_SystemError,
932 "Negative size passed to PyUnicode_New");
933 return NULL;
934 }
935 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
936 return PyErr_NoMemory();
937
938 /* Duplicated allocation code from _PyObject_New() instead of a call to
939 * PyObject_New() so we are able to allocate space for the object and
940 * it's data buffer.
941 */
942 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
943 if (obj == NULL)
944 return PyErr_NoMemory();
945 obj = PyObject_INIT(obj, &PyUnicode_Type);
946 if (obj == NULL)
947 return NULL;
948
949 unicode = (PyCompactUnicodeObject *)obj;
950 if (is_ascii)
951 data = ((PyASCIIObject*)obj) + 1;
952 else
953 data = unicode + 1;
954 _PyUnicode_LENGTH(unicode) = size;
955 _PyUnicode_HASH(unicode) = -1;
956 _PyUnicode_STATE(unicode).interned = 0;
957 _PyUnicode_STATE(unicode).kind = kind_state;
958 _PyUnicode_STATE(unicode).compact = 1;
959 _PyUnicode_STATE(unicode).ready = 1;
960 _PyUnicode_STATE(unicode).ascii = is_ascii;
961 if (is_ascii) {
962 ((char*)data)[size] = 0;
963 _PyUnicode_WSTR(unicode) = NULL;
964 }
965 else if (kind_state == PyUnicode_1BYTE_KIND) {
966 ((char*)data)[size] = 0;
967 _PyUnicode_WSTR(unicode) = NULL;
968 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200969 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200970 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200971 }
972 else {
973 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200974 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200975 if (kind_state == PyUnicode_2BYTE_KIND)
976 ((Py_UCS2*)data)[size] = 0;
977 else /* kind_state == PyUnicode_4BYTE_KIND */
978 ((Py_UCS4*)data)[size] = 0;
979 if (is_sharing) {
980 _PyUnicode_WSTR_LENGTH(unicode) = size;
981 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
982 }
983 else {
984 _PyUnicode_WSTR_LENGTH(unicode) = 0;
985 _PyUnicode_WSTR(unicode) = NULL;
986 }
987 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200988 assert(_PyUnicode_CheckConsistency(unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200989 return obj;
990}
991
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t range [begin, end) to UCS4, writing the result
   into the 4-byte data buffer of `unicode`.  A high surrogate immediately
   followed by a low surrogate is combined into one code point; every other
   unit (lone surrogates included) is copied unchanged.  The remaining
   conversions are implemented as macros for efficiency.

   The function assumes `unicode` is a 4-byte-kind string whose length
   equals the number of code points produced, with room for one more code
   point for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        /* never write past the destination's declared length */
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (src[0] >= 0xD800 && src[0] <= 0xDBFF
            && (src+1) < end && src[1] >= 0xDC00 && src[1] <= 0xDFFF)
        {
            /* valid surrogate pair: two units become one code point */
            *dst++ = (((src[0] & 0x3FF)<<10) | (src[1] & 0x3FF)) + 0x10000;
            src += 2;
            continue;
        }
        *dst++ = *src++;
    }
    /* the output must have been filled exactly */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1030
Victor Stinnercd9950f2011-10-02 00:34:53 +02001031static int
1032_PyUnicode_Dirty(PyObject *unicode)
1033{
Victor Stinner910337b2011-10-03 03:20:16 +02001034 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +02001035 if (Py_REFCNT(unicode) != 1) {
Victor Stinner01698042011-10-04 00:04:26 +02001036 PyErr_SetString(PyExc_SystemError,
Victor Stinnercd9950f2011-10-02 00:34:53 +02001037 "Cannot modify a string having more than 1 reference");
1038 return -1;
1039 }
1040 _PyUnicode_DIRTY(unicode);
1041 return 0;
1042}
1043
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001044static int
1045_copy_characters(PyObject *to, Py_ssize_t to_start,
1046 PyObject *from, Py_ssize_t from_start,
1047 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001048{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001049 unsigned int from_kind, to_kind;
1050 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001051 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001052
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001053 assert(PyUnicode_Check(from));
1054 assert(PyUnicode_Check(to));
1055 assert(PyUnicode_IS_READY(from));
1056 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001057
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001058 assert(PyUnicode_GET_LENGTH(from) >= how_many);
1059 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1060 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001061
Victor Stinnerf5ca1a22011-09-28 23:54:59 +02001062 if (how_many == 0)
1063 return 0;
1064
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001065 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001066 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001067 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001068 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001069
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001070#ifdef Py_DEBUG
1071 if (!check_maxchar
1072 && (from_kind > to_kind
1073 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001074 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001075 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1076 Py_UCS4 ch;
1077 Py_ssize_t i;
1078 for (i=0; i < how_many; i++) {
1079 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1080 assert(ch <= to_maxchar);
1081 }
1082 }
1083#endif
1084 fast = (from_kind == to_kind);
1085 if (check_maxchar
1086 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1087 {
1088 /* deny latin1 => ascii */
1089 fast = 0;
1090 }
1091
1092 if (fast) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001093 Py_MEMCPY((char*)to_data + to_kind * to_start,
1094 (char*)from_data + from_kind * from_start,
1095 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001096 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001097 else if (from_kind == PyUnicode_1BYTE_KIND
1098 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001099 {
1100 _PyUnicode_CONVERT_BYTES(
1101 Py_UCS1, Py_UCS2,
1102 PyUnicode_1BYTE_DATA(from) + from_start,
1103 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1104 PyUnicode_2BYTE_DATA(to) + to_start
1105 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001106 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001107 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001108 && to_kind == PyUnicode_4BYTE_KIND)
1109 {
1110 _PyUnicode_CONVERT_BYTES(
1111 Py_UCS1, Py_UCS4,
1112 PyUnicode_1BYTE_DATA(from) + from_start,
1113 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1114 PyUnicode_4BYTE_DATA(to) + to_start
1115 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001116 }
1117 else if (from_kind == PyUnicode_2BYTE_KIND
1118 && to_kind == PyUnicode_4BYTE_KIND)
1119 {
1120 _PyUnicode_CONVERT_BYTES(
1121 Py_UCS2, Py_UCS4,
1122 PyUnicode_2BYTE_DATA(from) + from_start,
1123 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1124 PyUnicode_4BYTE_DATA(to) + to_start
1125 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001126 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001127 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001128 /* check if max_char(from substring) <= max_char(to) */
1129 if (from_kind > to_kind
1130 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001131 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001132 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001133 /* slow path to check for character overflow */
1134 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001135 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001136 Py_ssize_t i;
1137
Victor Stinner56c161a2011-10-06 02:47:11 +02001138#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001139 for (i=0; i < how_many; i++) {
1140 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001141 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001142 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1143 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001144#else
1145 if (!check_maxchar) {
1146 for (i=0; i < how_many; i++) {
1147 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1148 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1149 }
1150 }
1151 else {
1152 for (i=0; i < how_many; i++) {
1153 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1154 if (ch > to_maxchar)
1155 return 1;
1156 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1157 }
1158 }
1159#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001160 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001161 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001162 assert(0 && "inconsistent state");
1163 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001164 }
1165 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001166 return 0;
1167}
1168
1169static void
1170copy_characters(PyObject *to, Py_ssize_t to_start,
1171 PyObject *from, Py_ssize_t from_start,
1172 Py_ssize_t how_many)
1173{
1174 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1175}
1176
1177Py_ssize_t
1178PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1179 PyObject *from, Py_ssize_t from_start,
1180 Py_ssize_t how_many)
1181{
1182 int err;
1183
1184 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1185 PyErr_BadInternalCall();
1186 return -1;
1187 }
1188
1189 if (PyUnicode_READY(from))
1190 return -1;
1191 if (PyUnicode_READY(to))
1192 return -1;
1193
1194 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1195 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1196 PyErr_Format(PyExc_SystemError,
1197 "Cannot write %zi characters at %zi "
1198 "in a string of %zi characters",
1199 how_many, to_start, PyUnicode_GET_LENGTH(to));
1200 return -1;
1201 }
1202
1203 if (how_many == 0)
1204 return 0;
1205
1206 if (_PyUnicode_Dirty(to))
1207 return -1;
1208
1209 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1210 if (err) {
1211 PyErr_Format(PyExc_SystemError,
1212 "Cannot copy %s characters "
1213 "into a string of %s characters",
1214 unicode_kind_name(from),
1215 unicode_kind_name(to));
1216 return -1;
1217 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001218 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001219}
1220
Victor Stinner17222162011-09-28 22:15:37 +02001221/* Find the maximum code point and count the number of surrogate pairs so a
1222 correct string length can be computed before converting a string to UCS4.
1223 This function counts single surrogates as a character and not as a pair.
1224
1225 Return 0 on success, or -1 on error. */
1226static int
1227find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1228 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001229{
1230 const wchar_t *iter;
1231
Victor Stinnerc53be962011-10-02 21:33:54 +02001232 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001233 *num_surrogates = 0;
1234 *maxchar = 0;
1235
1236 for (iter = begin; iter < end; ) {
Victor Stinnerae864852011-10-05 14:02:44 +02001237 if (*iter > *maxchar) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001238 *maxchar = *iter;
Victor Stinnerae864852011-10-05 14:02:44 +02001239#if SIZEOF_WCHAR_T != 2
1240 if (*maxchar >= 0x10000)
1241 return 0;
1242#endif
1243 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001244#if SIZEOF_WCHAR_T == 2
1245 if (*iter >= 0xD800 && *iter <= 0xDBFF
1246 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1247 {
1248 Py_UCS4 surrogate_val;
1249 surrogate_val = (((iter[0] & 0x3FF)<<10)
1250 | (iter[1] & 0x3FF)) + 0x10000;
1251 ++(*num_surrogates);
1252 if (surrogate_val > *maxchar)
1253 *maxchar = surrogate_val;
1254 iter += 2;
1255 }
1256 else
1257 iter++;
1258#else
1259 iter++;
1260#endif
1261 }
1262 return 0;
1263}
1264
1265#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001266static int unicode_ready_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001267#endif
1268
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001269static int
1270unicode_ready(PyObject **p_obj, int replace)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001271{
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001272 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001273 wchar_t *end;
1274 Py_UCS4 maxchar = 0;
1275 Py_ssize_t num_surrogates;
1276#if SIZEOF_WCHAR_T == 2
1277 Py_ssize_t length_wo_surrogates;
1278#endif
1279
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001280 assert(p_obj != NULL);
1281 unicode = (PyUnicodeObject *)*p_obj;
1282
Georg Brandl7597add2011-10-05 16:36:47 +02001283 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001284 strings were created using _PyObject_New() and where no canonical
1285 representation (the str field) has been set yet aka strings
1286 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001287 assert(_PyUnicode_CHECK(unicode));
1288 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001289 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001290 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001291 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001292 /* Actually, it should neither be interned nor be anything else: */
1293 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001294
1295#ifdef Py_DEBUG
1296 ++unicode_ready_calls;
1297#endif
1298
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001299#ifdef Py_DEBUG
1300 assert(!replace || Py_REFCNT(unicode) == 1);
1301#else
1302 if (replace && Py_REFCNT(unicode) != 1)
1303 replace = 0;
1304#endif
1305 if (replace) {
1306 Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode);
1307 wchar_t *wstr = _PyUnicode_WSTR(unicode);
1308 /* Optimization for empty strings */
1309 if (len == 0) {
1310 Py_INCREF(unicode_empty);
1311 Py_DECREF(*p_obj);
1312 *p_obj = unicode_empty;
1313 return 0;
1314 }
1315 if (len == 1 && wstr[0] < 256) {
1316 PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]);
1317 if (latin1_char == NULL)
1318 return -1;
1319 Py_DECREF(*p_obj);
1320 *p_obj = latin1_char;
1321 return 0;
1322 }
1323 }
1324
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001325 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001326 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001327 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001328 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001329
1330 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001331 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1332 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001333 PyErr_NoMemory();
1334 return -1;
1335 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001336 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001337 _PyUnicode_WSTR(unicode), end,
1338 PyUnicode_1BYTE_DATA(unicode));
1339 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1340 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1341 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1342 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001343 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001344 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001345 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001346 }
1347 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001348 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001349 _PyUnicode_UTF8(unicode) = NULL;
1350 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001351 }
1352 PyObject_FREE(_PyUnicode_WSTR(unicode));
1353 _PyUnicode_WSTR(unicode) = NULL;
1354 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1355 }
1356 /* In this case we might have to convert down from 4-byte native
1357 wchar_t to 2-byte unicode. */
1358 else if (maxchar < 65536) {
1359 assert(num_surrogates == 0 &&
1360 "FindMaxCharAndNumSurrogatePairs() messed up");
1361
Victor Stinner506f5922011-09-28 22:34:18 +02001362#if SIZEOF_WCHAR_T == 2
1363 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001364 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001365 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1366 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1367 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001368 _PyUnicode_UTF8(unicode) = NULL;
1369 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001370#else
1371 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001372 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001373 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001374 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001375 PyErr_NoMemory();
1376 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001377 }
Victor Stinner506f5922011-09-28 22:34:18 +02001378 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1379 _PyUnicode_WSTR(unicode), end,
1380 PyUnicode_2BYTE_DATA(unicode));
1381 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1382 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1383 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001384 _PyUnicode_UTF8(unicode) = NULL;
1385 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001386 PyObject_FREE(_PyUnicode_WSTR(unicode));
1387 _PyUnicode_WSTR(unicode) = NULL;
1388 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1389#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001390 }
1391 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1392 else {
1393#if SIZEOF_WCHAR_T == 2
1394 /* in case the native representation is 2-bytes, we need to allocate a
1395 new normalized 4-byte version. */
1396 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001397 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1398 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001399 PyErr_NoMemory();
1400 return -1;
1401 }
1402 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1403 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001404 _PyUnicode_UTF8(unicode) = NULL;
1405 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001406 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1407 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001408 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001409 PyObject_FREE(_PyUnicode_WSTR(unicode));
1410 _PyUnicode_WSTR(unicode) = NULL;
1411 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1412#else
1413 assert(num_surrogates == 0);
1414
Victor Stinnerc3c74152011-10-02 20:39:55 +02001415 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001416 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001417 _PyUnicode_UTF8(unicode) = NULL;
1418 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001419 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1420#endif
1421 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1422 }
1423 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001424 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001425 return 0;
1426}
1427
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001428int
1429_PyUnicode_ReadyReplace(PyObject **op)
1430{
1431 return unicode_ready(op, 1);
1432}
1433
1434int
1435_PyUnicode_Ready(PyObject *op)
1436{
1437 return unicode_ready(&op, 0);
1438}
1439
Alexander Belopolsky40018472011-02-26 01:02:56 +00001440static void
1441unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001442{
Walter Dörwald16807132007-05-25 13:52:07 +00001443 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001444 case SSTATE_NOT_INTERNED:
1445 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001446
Benjamin Peterson29060642009-01-31 22:14:21 +00001447 case SSTATE_INTERNED_MORTAL:
1448 /* revive dead object temporarily for DelItem */
1449 Py_REFCNT(unicode) = 3;
1450 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
1451 Py_FatalError(
1452 "deletion of interned string failed");
1453 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001454
Benjamin Peterson29060642009-01-31 22:14:21 +00001455 case SSTATE_INTERNED_IMMORTAL:
1456 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001457
Benjamin Peterson29060642009-01-31 22:14:21 +00001458 default:
1459 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001460 }
1461
Victor Stinner03490912011-10-03 23:45:12 +02001462 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001463 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001464 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001465 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001466
1467 if (PyUnicode_IS_COMPACT(unicode)) {
1468 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001469 }
1470 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001471 if (_PyUnicode_DATA_ANY(unicode))
1472 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Benjamin Peterson29060642009-01-31 22:14:21 +00001473 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001474 }
1475}
1476
#ifdef Py_DEBUG
/* Debug-only predicate: return 1 if `unicode` is the shared empty string or
   one of the cached one-character strings in unicode_latin1[], else 0. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a one-character string that is not of the wchar kind can be a
       cached Latin-1 character. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return ch < 256 && unicode_latin1[ch] == unicode;
}
#endif
1493
Alexander Belopolsky40018472011-02-26 01:02:56 +00001494static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001495unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001496{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001497 if (Py_REFCNT(unicode) != 1)
1498 return 0;
1499 if (PyUnicode_CHECK_INTERNED(unicode))
1500 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001501#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001502 /* singleton refcount is greater than 1 */
1503 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001504#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001505 return 1;
1506}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001507
Victor Stinnerfe226c02011-10-03 03:52:20 +02001508static int
1509unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1510{
1511 PyObject *unicode;
1512 Py_ssize_t old_length;
1513
1514 assert(p_unicode != NULL);
1515 unicode = *p_unicode;
1516
1517 assert(unicode != NULL);
1518 assert(PyUnicode_Check(unicode));
1519 assert(0 <= length);
1520
Victor Stinner910337b2011-10-03 03:20:16 +02001521 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001522 old_length = PyUnicode_WSTR_LENGTH(unicode);
1523 else
1524 old_length = PyUnicode_GET_LENGTH(unicode);
1525 if (old_length == length)
1526 return 0;
1527
Victor Stinnerfe226c02011-10-03 03:52:20 +02001528 if (!unicode_resizable(unicode)) {
1529 PyObject *copy = resize_copy(unicode, length);
1530 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001531 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001532 Py_DECREF(*p_unicode);
1533 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001534 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001535 }
1536
Victor Stinnerfe226c02011-10-03 03:52:20 +02001537 if (PyUnicode_IS_COMPACT(unicode)) {
1538 *p_unicode = resize_compact(unicode, length);
1539 if (*p_unicode == NULL)
1540 return -1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001541 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001542 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001543 }
1544 return resize_inplace((PyUnicodeObject*)unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001545}
1546
Alexander Belopolsky40018472011-02-26 01:02:56 +00001547int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001548PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001549{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001550 PyObject *unicode;
1551 if (p_unicode == NULL) {
1552 PyErr_BadInternalCall();
1553 return -1;
1554 }
1555 unicode = *p_unicode;
1556 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0
1557 || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND)
1558 {
1559 PyErr_BadInternalCall();
1560 return -1;
1561 }
1562 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001563}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001564
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001565static PyObject*
1566get_latin1_char(unsigned char ch)
1567{
Victor Stinnera464fc12011-10-02 20:39:30 +02001568 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001569 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001570 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001571 if (!unicode)
1572 return NULL;
1573 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001574 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001575 unicode_latin1[ch] = unicode;
1576 }
1577 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001578 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001579}
1580
Alexander Belopolsky40018472011-02-26 01:02:56 +00001581PyObject *
1582PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001583{
1584 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001585 Py_UCS4 maxchar = 0;
1586 Py_ssize_t num_surrogates;
1587
1588 if (u == NULL)
1589 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001590
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001591 /* If the Unicode data is known at construction time, we can apply
1592 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001593
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001594 /* Optimization for empty strings */
1595 if (size == 0 && unicode_empty != NULL) {
1596 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001597 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001598 }
Tim Petersced69f82003-09-16 20:30:58 +00001599
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001600 /* Single character Unicode objects in the Latin-1 range are
1601 shared when using this constructor */
1602 if (size == 1 && *u < 256)
1603 return get_latin1_char((unsigned char)*u);
1604
1605 /* If not empty and not single character, copy the Unicode data
1606 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001607 if (find_maxchar_surrogates(u, u + size,
1608 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001609 return NULL;
1610
1611 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1612 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001613 if (!unicode)
1614 return NULL;
1615
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001616 switch (PyUnicode_KIND(unicode)) {
1617 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001618 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001619 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1620 break;
1621 case PyUnicode_2BYTE_KIND:
1622#if Py_UNICODE_SIZE == 2
1623 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1624#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001625 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001626 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1627#endif
1628 break;
1629 case PyUnicode_4BYTE_KIND:
1630#if SIZEOF_WCHAR_T == 2
1631 /* This is the only case which has to process surrogates, thus
1632 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001633 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001634#else
1635 assert(num_surrogates == 0);
1636 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1637#endif
1638 break;
1639 default:
1640 assert(0 && "Impossible state");
1641 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001642
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001643 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001644 return (PyObject *)unicode;
1645}
1646
Alexander Belopolsky40018472011-02-26 01:02:56 +00001647PyObject *
1648PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001649{
1650 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001651
Benjamin Peterson14339b62009-01-31 16:36:08 +00001652 if (size < 0) {
1653 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001654 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001655 return NULL;
1656 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001657
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001658 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001659 some optimizations which share commonly used objects.
1660 Also, this means the input must be UTF-8, so fall back to the
1661 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001662 if (u != NULL) {
1663
Benjamin Peterson29060642009-01-31 22:14:21 +00001664 /* Optimization for empty strings */
1665 if (size == 0 && unicode_empty != NULL) {
1666 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001667 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001668 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001669
1670 /* Single characters are shared when using this constructor.
1671 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001672 if (size == 1 && Py_CHARMASK(*u) < 128)
1673 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001674
1675 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001676 }
1677
Walter Dörwald55507312007-05-18 13:12:10 +00001678 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001679 if (!unicode)
1680 return NULL;
1681
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001682 return (PyObject *)unicode;
1683}
1684
Alexander Belopolsky40018472011-02-26 01:02:56 +00001685PyObject *
1686PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001687{
1688 size_t size = strlen(u);
1689 if (size > PY_SSIZE_T_MAX) {
1690 PyErr_SetString(PyExc_OverflowError, "input too long");
1691 return NULL;
1692 }
1693
1694 return PyUnicode_FromStringAndSize(u, size);
1695}
1696
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001697PyObject *
1698_PyUnicode_FromId(_Py_Identifier *id)
1699{
1700 if (!id->object) {
1701 id->object = PyUnicode_FromString(id->string);
1702 if (!id->object)
1703 return NULL;
1704 PyUnicode_InternInPlace(&id->object);
1705 assert(!id->next);
1706 id->next = static_strings;
1707 static_strings = id;
1708 }
1709 Py_INCREF(id->object);
1710 return id->object;
1711}
1712
1713void
1714_PyUnicode_ClearStaticStrings()
1715{
1716 _Py_Identifier *i;
1717 for (i = static_strings; i; i = i->next) {
1718 Py_DECREF(i->object);
1719 i->object = NULL;
1720 i->next = NULL;
1721 }
1722}
1723
Victor Stinnere57b1c02011-09-28 22:20:48 +02001724static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001725unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001726{
Victor Stinner0617b6e2011-10-05 23:26:01 +02001727 PyObject *res;
1728#ifdef Py_DEBUG
1729 const unsigned char *p;
1730 const unsigned char *end = s + size;
1731 for (p=s; p < end; p++) {
1732 assert(*p < 128);
1733 }
1734#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001735 if (size == 1)
1736 return get_latin1_char(s[0]);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001737 res = PyUnicode_New(size, 127);
Victor Stinner702c7342011-10-05 13:50:52 +02001738 if (!res)
1739 return NULL;
Victor Stinner0617b6e2011-10-05 23:26:01 +02001740 memcpy(PyUnicode_1BYTE_DATA(res), s, size);
Victor Stinner702c7342011-10-05 13:50:52 +02001741 return res;
1742}
1743
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001744static Py_UCS4
1745kind_maxchar_limit(unsigned int kind)
1746{
1747 switch(kind) {
1748 case PyUnicode_1BYTE_KIND:
1749 return 0x80;
1750 case PyUnicode_2BYTE_KIND:
1751 return 0x100;
1752 case PyUnicode_4BYTE_KIND:
1753 return 0x10000;
1754 default:
1755 assert(0 && "invalid kind");
1756 return 0x10ffff;
1757 }
1758}
1759
Victor Stinner702c7342011-10-05 13:50:52 +02001760static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001761_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001762{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001763 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001764 unsigned char max_char = 127;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001765
1766 assert(size >= 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001767 if (size == 1)
1768 return get_latin1_char(u[0]);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001769 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001770 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001771 if (!res)
1772 return NULL;
1773 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001774 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001775 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001776}
1777
Victor Stinnere57b1c02011-09-28 22:20:48 +02001778static PyObject*
1779_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001780{
1781 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001782 Py_UCS2 max_char = 0;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001783
1784 assert(size >= 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001785 if (size == 1 && u[0] < 256)
Victor Stinner4e101002011-10-11 23:27:52 +02001786 return get_latin1_char((unsigned char)u[0]);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001787 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001788 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001789 if (!res)
1790 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001791 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001792 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001793 else {
1794 _PyUnicode_CONVERT_BYTES(
1795 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1796 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001797 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001798 return res;
1799}
1800
Victor Stinnere57b1c02011-09-28 22:20:48 +02001801static PyObject*
1802_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001803{
1804 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001805 Py_UCS4 max_char = 0;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001806
1807 assert(size >= 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001808 if (size == 1 && u[0] < 256)
1809 return get_latin1_char(u[0]);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001810 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001811 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001812 if (!res)
1813 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001814 if (max_char < 256)
1815 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1816 PyUnicode_1BYTE_DATA(res));
1817 else if (max_char < 0x10000)
1818 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1819 PyUnicode_2BYTE_DATA(res));
1820 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001821 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001822 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001823 return res;
1824}
1825
1826PyObject*
1827PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1828{
1829 switch(kind) {
1830 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001831 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001832 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001833 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001834 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001835 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001836 default:
1837 assert(0 && "invalid kind");
1838 PyErr_SetString(PyExc_SystemError, "invalid kind");
1839 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001840 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001841}
1842
Victor Stinner25a4b292011-10-06 12:31:55 +02001843/* Ensure that a string uses the most efficient storage, if it is not the
1844 case: create a new string with of the right kind. Write NULL into *p_unicode
1845 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001846static void
Victor Stinner25a4b292011-10-06 12:31:55 +02001847unicode_adjust_maxchar(PyObject **p_unicode)
1848{
1849 PyObject *unicode, *copy;
1850 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001851 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02001852 unsigned int kind;
1853
1854 assert(p_unicode != NULL);
1855 unicode = *p_unicode;
1856 assert(PyUnicode_IS_READY(unicode));
1857 if (PyUnicode_IS_ASCII(unicode))
1858 return;
1859
1860 len = PyUnicode_GET_LENGTH(unicode);
1861 kind = PyUnicode_KIND(unicode);
1862 if (kind == PyUnicode_1BYTE_KIND) {
1863 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001864 max_char = ucs1lib_find_max_char(u, u + len);
1865 if (max_char >= 128)
1866 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001867 }
1868 else if (kind == PyUnicode_2BYTE_KIND) {
1869 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001870 max_char = ucs2lib_find_max_char(u, u + len);
1871 if (max_char >= 256)
1872 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001873 }
1874 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001875 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02001876 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001877 max_char = ucs4lib_find_max_char(u, u + len);
1878 if (max_char >= 0x10000)
1879 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001880 }
Victor Stinner25a4b292011-10-06 12:31:55 +02001881 copy = PyUnicode_New(len, max_char);
1882 copy_characters(copy, 0, unicode, 0, len);
1883 Py_DECREF(unicode);
1884 *p_unicode = copy;
1885}
1886
Victor Stinner034f6cf2011-09-30 02:26:44 +02001887PyObject*
1888PyUnicode_Copy(PyObject *unicode)
1889{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001890 Py_ssize_t size;
1891 PyObject *copy;
1892 void *data;
1893
Victor Stinner034f6cf2011-09-30 02:26:44 +02001894 if (!PyUnicode_Check(unicode)) {
1895 PyErr_BadInternalCall();
1896 return NULL;
1897 }
1898 if (PyUnicode_READY(unicode))
1899 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001900
1901 size = PyUnicode_GET_LENGTH(unicode);
1902 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1903 if (!copy)
1904 return NULL;
1905 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1906
1907 data = PyUnicode_DATA(unicode);
1908 switch (PyUnicode_KIND(unicode))
1909 {
1910 case PyUnicode_1BYTE_KIND:
1911 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1912 break;
1913 case PyUnicode_2BYTE_KIND:
1914 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1915 break;
1916 case PyUnicode_4BYTE_KIND:
1917 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1918 break;
1919 default:
1920 assert(0);
1921 break;
1922 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001923 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001924 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001925}
1926
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001927
Victor Stinnerbc603d12011-10-02 01:00:40 +02001928/* Widen Unicode objects to larger buffers. Don't write terminating null
1929 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001930
1931void*
1932_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1933{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001934 Py_ssize_t len;
1935 void *result;
1936 unsigned int skind;
1937
1938 if (PyUnicode_READY(s))
1939 return NULL;
1940
1941 len = PyUnicode_GET_LENGTH(s);
1942 skind = PyUnicode_KIND(s);
1943 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02001944 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001945 return NULL;
1946 }
1947 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001948 case PyUnicode_2BYTE_KIND:
1949 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1950 if (!result)
1951 return PyErr_NoMemory();
1952 assert(skind == PyUnicode_1BYTE_KIND);
1953 _PyUnicode_CONVERT_BYTES(
1954 Py_UCS1, Py_UCS2,
1955 PyUnicode_1BYTE_DATA(s),
1956 PyUnicode_1BYTE_DATA(s) + len,
1957 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001958 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001959 case PyUnicode_4BYTE_KIND:
1960 result = PyMem_Malloc(len * sizeof(Py_UCS4));
1961 if (!result)
1962 return PyErr_NoMemory();
1963 if (skind == PyUnicode_2BYTE_KIND) {
1964 _PyUnicode_CONVERT_BYTES(
1965 Py_UCS2, Py_UCS4,
1966 PyUnicode_2BYTE_DATA(s),
1967 PyUnicode_2BYTE_DATA(s) + len,
1968 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001969 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001970 else {
1971 assert(skind == PyUnicode_1BYTE_KIND);
1972 _PyUnicode_CONVERT_BYTES(
1973 Py_UCS1, Py_UCS4,
1974 PyUnicode_1BYTE_DATA(s),
1975 PyUnicode_1BYTE_DATA(s) + len,
1976 result);
1977 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001978 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001979 default:
1980 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001981 }
Victor Stinner01698042011-10-04 00:04:26 +02001982 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001983 return NULL;
1984}
1985
1986static Py_UCS4*
1987as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1988 int copy_null)
1989{
1990 int kind;
1991 void *data;
1992 Py_ssize_t len, targetlen;
1993 if (PyUnicode_READY(string) == -1)
1994 return NULL;
1995 kind = PyUnicode_KIND(string);
1996 data = PyUnicode_DATA(string);
1997 len = PyUnicode_GET_LENGTH(string);
1998 targetlen = len;
1999 if (copy_null)
2000 targetlen++;
2001 if (!target) {
2002 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2003 PyErr_NoMemory();
2004 return NULL;
2005 }
2006 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2007 if (!target) {
2008 PyErr_NoMemory();
2009 return NULL;
2010 }
2011 }
2012 else {
2013 if (targetsize < targetlen) {
2014 PyErr_Format(PyExc_SystemError,
2015 "string is longer than the buffer");
2016 if (copy_null && 0 < targetsize)
2017 target[0] = 0;
2018 return NULL;
2019 }
2020 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002021 if (kind == PyUnicode_1BYTE_KIND) {
2022 Py_UCS1 *start = (Py_UCS1 *) data;
2023 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002024 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002025 else if (kind == PyUnicode_2BYTE_KIND) {
2026 Py_UCS2 *start = (Py_UCS2 *) data;
2027 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2028 }
2029 else {
2030 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002031 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002032 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002033 if (copy_null)
2034 target[len] = 0;
2035 return target;
2036}
2037
2038Py_UCS4*
2039PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2040 int copy_null)
2041{
2042 if (target == NULL || targetsize < 1) {
2043 PyErr_BadInternalCall();
2044 return NULL;
2045 }
2046 return as_ucs4(string, target, targetsize, copy_null);
2047}
2048
2049Py_UCS4*
2050PyUnicode_AsUCS4Copy(PyObject *string)
2051{
2052 return as_ucs4(string, NULL, 0, 1);
2053}
2054
#ifdef HAVE_WCHAR_H

/* Create a unicode object from the wide-character buffer `w`.

   - w == NULL and size == 0: a new empty string from PyUnicode_New(0, 0).
   - w == NULL otherwise: bad internal call, returns NULL.
   - size == -1: the length is taken from wcslen(w).
   The data is then handed to PyUnicode_FromUnicode(). */
PyObject *
PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
{
    if (w == NULL) {
        if (size != 0) {
            PyErr_BadInternalCall();
            return NULL;
        }
        return PyUnicode_New(0, 0);
    }

    if (size == -1)
        size = wcslen(w);

    return PyUnicode_FromUnicode(w, size);
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002075
Walter Dörwald346737f2007-05-31 10:44:43 +00002076static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002077makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2078 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002079{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002080 *fmt++ = '%';
2081 if (width) {
2082 if (zeropad)
2083 *fmt++ = '0';
2084 fmt += sprintf(fmt, "%d", width);
2085 }
2086 if (precision)
2087 fmt += sprintf(fmt, ".%d", precision);
2088 if (longflag)
2089 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002090 else if (longlongflag) {
2091 /* longlongflag should only ever be nonzero on machines with
2092 HAVE_LONG_LONG defined */
2093#ifdef HAVE_LONG_LONG
2094 char *f = PY_FORMAT_LONG_LONG;
2095 while (*f)
2096 *fmt++ = *f++;
2097#else
2098 /* we shouldn't ever get here */
2099 assert(0);
2100 *fmt++ = 'l';
2101#endif
2102 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002103 else if (size_tflag) {
2104 char *f = PY_FORMAT_SIZE_T;
2105 while (*f)
2106 *fmt++ = *f++;
2107 }
2108 *fmt++ = c;
2109 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002110}
2111
/* Helper for PyUnicode_FromFormatV(): parse the part of a conversion
   specification that follows its first character.

   `f` points at the first character of the specification (the caller in
   this file invokes it on '%'); that character is skipped.  An optional
   decimal width, an optional '.' plus decimal precision and an optional
   length modifier ("l", "ll" when HAVE_LONG_LONG is defined, or "z") are
   then consumed.  A length modifier is only recognised when it is directly
   followed by 'd', 'u' or 'i'.

   Each output pointer may be NULL, in which case that value is not
   reported.  Returns the position reached after parsing. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    ++f;
    for (; Py_ISDIGIT((unsigned)*f); f++)
        width = (width*10) + *f - '0';
    if (*f == '.') {
        ++f;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            --f;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        --f;
    }
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;

    switch (*f) {
    case 'l':
        /* Handle %ld, %lu, %lld and %llu. */
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            f += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
        break;

    case 'z':
        /* handle the size_t flag. */
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            size_tflag = 1;
            f += 1;
        }
        break;

    default:
        break;
    }

    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
2176
/* Upper bound on the number of characters produced by a %ld conversion:
   21 is enough for a 64-bit integer printed in decimal plus an optional
   sign. */
#define MAX_LONG_CHARS 21
/* Upper bound on the number of characters produced by a %lld conversion.
   At most ceil(log10(256)*SIZEOF_LONG_LONG) digits are needed, plus 1 for
   the sign; 53/22 is an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2184
Walter Dörwaldd2034312007-05-18 16:29:38 +00002185PyObject *
2186PyUnicode_FromFormatV(const char *format, va_list vargs)
2187{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002188 va_list count;
2189 Py_ssize_t callcount = 0;
2190 PyObject **callresults = NULL;
2191 PyObject **callresult = NULL;
2192 Py_ssize_t n = 0;
2193 int width = 0;
2194 int precision = 0;
2195 int zeropad;
2196 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002197 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002198 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002199 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002200 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2201 Py_UCS4 argmaxchar;
2202 Py_ssize_t numbersize = 0;
2203 char *numberresults = NULL;
2204 char *numberresult = NULL;
2205 Py_ssize_t i;
2206 int kind;
2207 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002208
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002209 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002210 /* step 1: count the number of %S/%R/%A/%s format specifications
2211 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2212 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002213 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002214 * also estimate a upper bound for all the number formats in the string,
2215 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002216 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002217 for (f = format; *f; f++) {
2218 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002219 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002220 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2221 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2222 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2223 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002224
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002225 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002226#ifdef HAVE_LONG_LONG
2227 if (longlongflag) {
2228 if (width < MAX_LONG_LONG_CHARS)
2229 width = MAX_LONG_LONG_CHARS;
2230 }
2231 else
2232#endif
2233 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2234 including sign. Decimal takes the most space. This
2235 isn't enough for octal. If a width is specified we
2236 need more (which we allocate later). */
2237 if (width < MAX_LONG_CHARS)
2238 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002239
2240 /* account for the size + '\0' to separate numbers
2241 inside of the numberresults buffer */
2242 numbersize += (width + 1);
2243 }
2244 }
2245 else if ((unsigned char)*f > 127) {
2246 PyErr_Format(PyExc_ValueError,
2247 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2248 "string, got a non-ASCII byte: 0x%02x",
2249 (unsigned char)*f);
2250 return NULL;
2251 }
2252 }
2253 /* step 2: allocate memory for the results of
2254 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2255 if (callcount) {
2256 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2257 if (!callresults) {
2258 PyErr_NoMemory();
2259 return NULL;
2260 }
2261 callresult = callresults;
2262 }
2263 /* step 2.5: allocate memory for the results of formating numbers */
2264 if (numbersize) {
2265 numberresults = PyObject_Malloc(numbersize);
2266 if (!numberresults) {
2267 PyErr_NoMemory();
2268 goto fail;
2269 }
2270 numberresult = numberresults;
2271 }
2272
2273 /* step 3: format numbers and figure out how large a buffer we need */
2274 for (f = format; *f; f++) {
2275 if (*f == '%') {
2276 const char* p;
2277 int longflag;
2278 int longlongflag;
2279 int size_tflag;
2280 int numprinted;
2281
2282 p = f;
2283 zeropad = (f[1] == '0');
2284 f = parse_format_flags(f, &width, &precision,
2285 &longflag, &longlongflag, &size_tflag);
2286 switch (*f) {
2287 case 'c':
2288 {
2289 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002290 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002291 n++;
2292 break;
2293 }
2294 case '%':
2295 n++;
2296 break;
2297 case 'i':
2298 case 'd':
2299 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2300 width, precision, *f);
2301 if (longflag)
2302 numprinted = sprintf(numberresult, fmt,
2303 va_arg(count, long));
2304#ifdef HAVE_LONG_LONG
2305 else if (longlongflag)
2306 numprinted = sprintf(numberresult, fmt,
2307 va_arg(count, PY_LONG_LONG));
2308#endif
2309 else if (size_tflag)
2310 numprinted = sprintf(numberresult, fmt,
2311 va_arg(count, Py_ssize_t));
2312 else
2313 numprinted = sprintf(numberresult, fmt,
2314 va_arg(count, int));
2315 n += numprinted;
2316 /* advance by +1 to skip over the '\0' */
2317 numberresult += (numprinted + 1);
2318 assert(*(numberresult - 1) == '\0');
2319 assert(*(numberresult - 2) != '\0');
2320 assert(numprinted >= 0);
2321 assert(numberresult <= numberresults + numbersize);
2322 break;
2323 case 'u':
2324 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2325 width, precision, 'u');
2326 if (longflag)
2327 numprinted = sprintf(numberresult, fmt,
2328 va_arg(count, unsigned long));
2329#ifdef HAVE_LONG_LONG
2330 else if (longlongflag)
2331 numprinted = sprintf(numberresult, fmt,
2332 va_arg(count, unsigned PY_LONG_LONG));
2333#endif
2334 else if (size_tflag)
2335 numprinted = sprintf(numberresult, fmt,
2336 va_arg(count, size_t));
2337 else
2338 numprinted = sprintf(numberresult, fmt,
2339 va_arg(count, unsigned int));
2340 n += numprinted;
2341 numberresult += (numprinted + 1);
2342 assert(*(numberresult - 1) == '\0');
2343 assert(*(numberresult - 2) != '\0');
2344 assert(numprinted >= 0);
2345 assert(numberresult <= numberresults + numbersize);
2346 break;
2347 case 'x':
2348 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2349 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2350 n += numprinted;
2351 numberresult += (numprinted + 1);
2352 assert(*(numberresult - 1) == '\0');
2353 assert(*(numberresult - 2) != '\0');
2354 assert(numprinted >= 0);
2355 assert(numberresult <= numberresults + numbersize);
2356 break;
2357 case 'p':
2358 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2359 /* %p is ill-defined: ensure leading 0x. */
2360 if (numberresult[1] == 'X')
2361 numberresult[1] = 'x';
2362 else if (numberresult[1] != 'x') {
2363 memmove(numberresult + 2, numberresult,
2364 strlen(numberresult) + 1);
2365 numberresult[0] = '0';
2366 numberresult[1] = 'x';
2367 numprinted += 2;
2368 }
2369 n += numprinted;
2370 numberresult += (numprinted + 1);
2371 assert(*(numberresult - 1) == '\0');
2372 assert(*(numberresult - 2) != '\0');
2373 assert(numprinted >= 0);
2374 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002375 break;
2376 case 's':
2377 {
2378 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002379 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002380 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2381 if (!str)
2382 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002383 /* since PyUnicode_DecodeUTF8 returns already flexible
2384 unicode objects, there is no need to call ready on them */
2385 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002386 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002387 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002388 /* Remember the str and switch to the next slot */
2389 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002390 break;
2391 }
2392 case 'U':
2393 {
2394 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002395 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002396 if (PyUnicode_READY(obj) == -1)
2397 goto fail;
2398 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002399 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002400 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002401 break;
2402 }
2403 case 'V':
2404 {
2405 PyObject *obj = va_arg(count, PyObject *);
2406 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002407 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002408 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002409 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002410 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002411 if (PyUnicode_READY(obj) == -1)
2412 goto fail;
2413 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002414 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002415 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002416 *callresult++ = NULL;
2417 }
2418 else {
2419 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2420 if (!str_obj)
2421 goto fail;
Victor Stinnere1335c72011-10-04 20:53:03 +02002422 if (PyUnicode_READY(str_obj)) {
2423 Py_DECREF(str_obj);
2424 goto fail;
2425 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002426 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002427 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002428 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002429 *callresult++ = str_obj;
2430 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002431 break;
2432 }
2433 case 'S':
2434 {
2435 PyObject *obj = va_arg(count, PyObject *);
2436 PyObject *str;
2437 assert(obj);
2438 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002439 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002440 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002441 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002442 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002443 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002444 /* Remember the str and switch to the next slot */
2445 *callresult++ = str;
2446 break;
2447 }
2448 case 'R':
2449 {
2450 PyObject *obj = va_arg(count, PyObject *);
2451 PyObject *repr;
2452 assert(obj);
2453 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002454 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002455 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002456 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002457 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002458 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002459 /* Remember the repr and switch to the next slot */
2460 *callresult++ = repr;
2461 break;
2462 }
2463 case 'A':
2464 {
2465 PyObject *obj = va_arg(count, PyObject *);
2466 PyObject *ascii;
2467 assert(obj);
2468 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002469 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002470 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002471 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002472 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002473 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002474 /* Remember the repr and switch to the next slot */
2475 *callresult++ = ascii;
2476 break;
2477 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002478 default:
2479 /* if we stumble upon an unknown
2480 formatting code, copy the rest of
2481 the format string to the output
2482 string. (we cannot just skip the
2483 code, since there's no way to know
2484 what's in the argument list) */
2485 n += strlen(p);
2486 goto expand;
2487 }
2488 } else
2489 n++;
2490 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002491 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002492 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002493 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002494 we don't have to resize the string.
2495 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002496 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002497 if (!string)
2498 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002499 kind = PyUnicode_KIND(string);
2500 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002501 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002502 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002503
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002504 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002505 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002506 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002507
2508 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002509 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2510 /* checking for == because the last argument could be a empty
2511 string, which causes i to point to end, the assert at the end of
2512 the loop */
2513 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002514
Benjamin Peterson14339b62009-01-31 16:36:08 +00002515 switch (*f) {
2516 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002517 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002518 const int ordinal = va_arg(vargs, int);
2519 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002520 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002521 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002522 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002523 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002524 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002525 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002526 case 'p':
2527 /* unused, since we already have the result */
2528 if (*f == 'p')
2529 (void) va_arg(vargs, void *);
2530 else
2531 (void) va_arg(vargs, int);
2532 /* extract the result from numberresults and append. */
2533 for (; *numberresult; ++i, ++numberresult)
2534 PyUnicode_WRITE(kind, data, i, *numberresult);
2535 /* skip over the separating '\0' */
2536 assert(*numberresult == '\0');
2537 numberresult++;
2538 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002539 break;
2540 case 's':
2541 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002542 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002543 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002544 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002545 size = PyUnicode_GET_LENGTH(*callresult);
2546 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002547 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002548 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002549 /* We're done with the unicode()/repr() => forget it */
2550 Py_DECREF(*callresult);
2551 /* switch to next unicode()/repr() result */
2552 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002553 break;
2554 }
2555 case 'U':
2556 {
2557 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002558 Py_ssize_t size;
2559 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2560 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002561 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002562 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002563 break;
2564 }
2565 case 'V':
2566 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002567 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002568 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002569 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002570 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002571 size = PyUnicode_GET_LENGTH(obj);
2572 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002573 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002574 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002575 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002576 size = PyUnicode_GET_LENGTH(*callresult);
2577 assert(PyUnicode_KIND(*callresult) <=
2578 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002579 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002580 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002581 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002582 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002583 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002584 break;
2585 }
2586 case 'S':
2587 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002588 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002589 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002590 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002591 /* unused, since we already have the result */
2592 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002593 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002594 copy_characters(string, i, *callresult, 0, size);
2595 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002596 /* We're done with the unicode()/repr() => forget it */
2597 Py_DECREF(*callresult);
2598 /* switch to next unicode()/repr() result */
2599 ++callresult;
2600 break;
2601 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002602 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002603 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002604 break;
2605 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002606 for (; *p; ++p, ++i)
2607 PyUnicode_WRITE(kind, data, i, *p);
2608 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002609 goto end;
2610 }
Victor Stinner1205f272010-09-11 00:54:47 +00002611 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002612 else {
2613 assert(i < PyUnicode_GET_LENGTH(string));
2614 PyUnicode_WRITE(kind, data, i++, *f);
2615 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002616 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002617 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002618
Benjamin Peterson29060642009-01-31 22:14:21 +00002619 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002620 if (callresults)
2621 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002622 if (numberresults)
2623 PyObject_Free(numberresults);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002624 assert(_PyUnicode_CheckConsistency(string, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002625 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00002626 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002627 if (callresults) {
2628 PyObject **callresult2 = callresults;
2629 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002630 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002631 ++callresult2;
2632 }
2633 PyObject_Free(callresults);
2634 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002635 if (numberresults)
2636 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002637 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002638}
2639
Walter Dörwaldd2034312007-05-18 16:29:38 +00002640PyObject *
2641PyUnicode_FromFormat(const char *format, ...)
2642{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002643 PyObject* ret;
2644 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002645
2646#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002647 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002648#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002649 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002650#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002651 ret = PyUnicode_FromFormatV(format, vargs);
2652 va_end(vargs);
2653 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002654}
2655
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002656#ifdef HAVE_WCHAR_H
2657
Victor Stinner5593d8a2010-10-02 11:11:27 +00002658/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2659 convert a Unicode object to a wide character string.
2660
Victor Stinnerd88d9832011-09-06 02:00:05 +02002661 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002662 character) required to convert the unicode object. Ignore size argument.
2663
Victor Stinnerd88d9832011-09-06 02:00:05 +02002664 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002665 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002666 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002667static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00002668unicode_aswidechar(PyUnicodeObject *unicode,
2669 wchar_t *w,
2670 Py_ssize_t size)
2671{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002672 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002673 const wchar_t *wstr;
2674
2675 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
2676 if (wstr == NULL)
2677 return -1;
2678
Victor Stinner5593d8a2010-10-02 11:11:27 +00002679 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002680 if (size > res)
2681 size = res + 1;
2682 else
2683 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002684 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002685 return res;
2686 }
2687 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002688 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002689}
2690
2691Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002692PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002693 wchar_t *w,
2694 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002695{
2696 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002697 PyErr_BadInternalCall();
2698 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002699 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002700 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002701}
2702
Victor Stinner137c34c2010-09-29 10:25:54 +00002703wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002704PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002705 Py_ssize_t *size)
2706{
2707 wchar_t* buffer;
2708 Py_ssize_t buflen;
2709
2710 if (unicode == NULL) {
2711 PyErr_BadInternalCall();
2712 return NULL;
2713 }
2714
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002715 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002716 if (buflen == -1)
2717 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002718 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002719 PyErr_NoMemory();
2720 return NULL;
2721 }
2722
Victor Stinner137c34c2010-09-29 10:25:54 +00002723 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2724 if (buffer == NULL) {
2725 PyErr_NoMemory();
2726 return NULL;
2727 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002728 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002729 if (buflen == -1)
2730 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002731 if (size != NULL)
2732 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002733 return buffer;
2734}
2735
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002736#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002737
Alexander Belopolsky40018472011-02-26 01:02:56 +00002738PyObject *
2739PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002740{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002741 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002742 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002743 PyErr_SetString(PyExc_ValueError,
2744 "chr() arg not in range(0x110000)");
2745 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002746 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002747
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002748 if (ordinal < 256)
2749 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002750
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002751 v = PyUnicode_New(1, ordinal);
2752 if (v == NULL)
2753 return NULL;
2754 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002755 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002756 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002757}
2758
Alexander Belopolsky40018472011-02-26 01:02:56 +00002759PyObject *
2760PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002761{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002762 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002763 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002764 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002765 if (PyUnicode_READY(obj))
2766 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002767 Py_INCREF(obj);
2768 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002769 }
2770 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002771 /* For a Unicode subtype that's not a Unicode object,
2772 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002773 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002774 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002775 PyErr_Format(PyExc_TypeError,
2776 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002777 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002778 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002779}
2780
Alexander Belopolsky40018472011-02-26 01:02:56 +00002781PyObject *
2782PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002783 const char *encoding,
2784 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002785{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002786 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002787 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002788
Guido van Rossumd57fd912000-03-10 22:53:23 +00002789 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002790 PyErr_BadInternalCall();
2791 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002792 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002793
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002794 /* Decoding bytes objects is the most common case and should be fast */
2795 if (PyBytes_Check(obj)) {
2796 if (PyBytes_GET_SIZE(obj) == 0) {
2797 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002798 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002799 }
2800 else {
2801 v = PyUnicode_Decode(
2802 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2803 encoding, errors);
2804 }
2805 return v;
2806 }
2807
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002808 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002809 PyErr_SetString(PyExc_TypeError,
2810 "decoding str is not supported");
2811 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002812 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002813
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002814 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2815 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2816 PyErr_Format(PyExc_TypeError,
2817 "coercing to str: need bytes, bytearray "
2818 "or buffer-like object, %.80s found",
2819 Py_TYPE(obj)->tp_name);
2820 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002821 }
Tim Petersced69f82003-09-16 20:30:58 +00002822
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002823 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002824 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002825 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002826 }
Tim Petersced69f82003-09-16 20:30:58 +00002827 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002828 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002829
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002830 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002831 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002832}
2833
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  The result is written, null-terminated, into lower,
   a buffer of lower_len bytes.  A NULL encoding is normalized to "utf-8".

   Return 0 on error (the normalized name, including its terminating null
   byte, does not fit into lower_len bytes), 1 on success.  On error the
   contents of lower are unspecified. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    static const char default_encoding[] = "utf-8";
    const char *e;
    char *l;
    char *l_end;

    if (encoding == NULL) {
        /* sizeof counts the terminating null byte */
        if (lower_len < sizeof(default_encoding))
            return 0;
        strcpy(lower, default_encoding);
        return 1;
    }
    if (lower_len == 0) {
        /* no room even for the terminating null byte */
        return 0;
    }
    e = encoding;
    l = lower;
    /* last usable position: reserved for the terminating null byte */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2870
Alexander Belopolsky40018472011-02-26 01:02:56 +00002871PyObject *
2872PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002873 Py_ssize_t size,
2874 const char *encoding,
2875 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002876{
2877 PyObject *buffer = NULL, *unicode;
2878 Py_buffer info;
2879 char lower[11]; /* Enough for any encoding shortcut */
2880
Fred Drakee4315f52000-05-09 19:53:39 +00002881 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002882 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002883 if ((strcmp(lower, "utf-8") == 0) ||
2884 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002885 return PyUnicode_DecodeUTF8(s, size, errors);
2886 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002887 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002888 (strcmp(lower, "iso-8859-1") == 0))
2889 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002890#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002891 else if (strcmp(lower, "mbcs") == 0)
2892 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002893#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002894 else if (strcmp(lower, "ascii") == 0)
2895 return PyUnicode_DecodeASCII(s, size, errors);
2896 else if (strcmp(lower, "utf-16") == 0)
2897 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2898 else if (strcmp(lower, "utf-32") == 0)
2899 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2900 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002901
2902 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002903 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002904 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002905 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002906 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002907 if (buffer == NULL)
2908 goto onError;
2909 unicode = PyCodec_Decode(buffer, encoding, errors);
2910 if (unicode == NULL)
2911 goto onError;
2912 if (!PyUnicode_Check(unicode)) {
2913 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002914 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002915 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002916 Py_DECREF(unicode);
2917 goto onError;
2918 }
2919 Py_DECREF(buffer);
Victor Stinner17efeed2011-10-04 20:05:46 +02002920#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02002921 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002922 Py_DECREF(unicode);
2923 return NULL;
2924 }
Victor Stinner17efeed2011-10-04 20:05:46 +02002925#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002926 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002927 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002928
Benjamin Peterson29060642009-01-31 22:14:21 +00002929 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002930 Py_XDECREF(buffer);
2931 return NULL;
2932}
2933
Alexander Belopolsky40018472011-02-26 01:02:56 +00002934PyObject *
2935PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002936 const char *encoding,
2937 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002938{
2939 PyObject *v;
2940
2941 if (!PyUnicode_Check(unicode)) {
2942 PyErr_BadArgument();
2943 goto onError;
2944 }
2945
2946 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002947 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002948
2949 /* Decode via the codec registry */
2950 v = PyCodec_Decode(unicode, encoding, errors);
2951 if (v == NULL)
2952 goto onError;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002953 assert(_PyUnicode_CheckConsistency(v, 1));
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002954 return v;
2955
Benjamin Peterson29060642009-01-31 22:14:21 +00002956 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002957 return NULL;
2958}
2959
Alexander Belopolsky40018472011-02-26 01:02:56 +00002960PyObject *
2961PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002962 const char *encoding,
2963 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002964{
2965 PyObject *v;
2966
2967 if (!PyUnicode_Check(unicode)) {
2968 PyErr_BadArgument();
2969 goto onError;
2970 }
2971
2972 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002973 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002974
2975 /* Decode via the codec registry */
2976 v = PyCodec_Decode(unicode, encoding, errors);
2977 if (v == NULL)
2978 goto onError;
2979 if (!PyUnicode_Check(v)) {
2980 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002981 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002982 Py_TYPE(v)->tp_name);
2983 Py_DECREF(v);
2984 goto onError;
2985 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002986 assert(_PyUnicode_CheckConsistency(v, 1));
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002987 return v;
2988
Benjamin Peterson29060642009-01-31 22:14:21 +00002989 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002990 return NULL;
2991}
2992
Alexander Belopolsky40018472011-02-26 01:02:56 +00002993PyObject *
2994PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002995 Py_ssize_t size,
2996 const char *encoding,
2997 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002998{
2999 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003000
Guido van Rossumd57fd912000-03-10 22:53:23 +00003001 unicode = PyUnicode_FromUnicode(s, size);
3002 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003003 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003004 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3005 Py_DECREF(unicode);
3006 return v;
3007}
3008
Alexander Belopolsky40018472011-02-26 01:02:56 +00003009PyObject *
3010PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003011 const char *encoding,
3012 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003013{
3014 PyObject *v;
3015
3016 if (!PyUnicode_Check(unicode)) {
3017 PyErr_BadArgument();
3018 goto onError;
3019 }
3020
3021 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003022 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003023
3024 /* Encode via the codec registry */
3025 v = PyCodec_Encode(unicode, encoding, errors);
3026 if (v == NULL)
3027 goto onError;
3028 return v;
3029
Benjamin Peterson29060642009-01-31 22:14:21 +00003030 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003031 return NULL;
3032}
3033
Victor Stinnerad158722010-10-27 00:25:46 +00003034PyObject *
3035PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003036{
Victor Stinner99b95382011-07-04 14:23:54 +02003037#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003038 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3039 PyUnicode_GET_SIZE(unicode),
3040 NULL);
3041#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003042 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003043#else
Victor Stinner793b5312011-04-27 00:24:21 +02003044 PyInterpreterState *interp = PyThreadState_GET()->interp;
3045 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3046 cannot use it to encode and decode filenames before it is loaded. Load
3047 the Python codec requires to encode at least its own filename. Use the C
3048 version of the locale codec until the codec registry is initialized and
3049 the Python codec is loaded.
3050
3051 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3052 cannot only rely on it: check also interp->fscodec_initialized for
3053 subinterpreters. */
3054 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003055 return PyUnicode_AsEncodedString(unicode,
3056 Py_FileSystemDefaultEncoding,
3057 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003058 }
3059 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003060 /* locale encoding with surrogateescape */
3061 wchar_t *wchar;
3062 char *bytes;
3063 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00003064 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003065
3066 wchar = PyUnicode_AsWideCharString(unicode, NULL);
3067 if (wchar == NULL)
3068 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003069 bytes = _Py_wchar2char(wchar, &error_pos);
3070 if (bytes == NULL) {
3071 if (error_pos != (size_t)-1) {
3072 char *errmsg = strerror(errno);
3073 PyObject *exc = NULL;
3074 if (errmsg == NULL)
3075 errmsg = "Py_wchar2char() failed";
3076 raise_encode_exception(&exc,
3077 "filesystemencoding",
3078 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
3079 error_pos, error_pos+1,
3080 errmsg);
3081 Py_XDECREF(exc);
3082 }
3083 else
3084 PyErr_NoMemory();
3085 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003086 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003087 }
3088 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003089
3090 bytes_obj = PyBytes_FromString(bytes);
3091 PyMem_Free(bytes);
3092 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00003093 }
Victor Stinnerad158722010-10-27 00:25:46 +00003094#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003095}
3096
Alexander Belopolsky40018472011-02-26 01:02:56 +00003097PyObject *
3098PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003099 const char *encoding,
3100 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003101{
3102 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003103 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003104
Guido van Rossumd57fd912000-03-10 22:53:23 +00003105 if (!PyUnicode_Check(unicode)) {
3106 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003107 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003108 }
Fred Drakee4315f52000-05-09 19:53:39 +00003109
Fred Drakee4315f52000-05-09 19:53:39 +00003110 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003111 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003112 if ((strcmp(lower, "utf-8") == 0) ||
3113 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003114 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003115 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003116 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003117 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003118 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003119 }
Victor Stinner37296e82010-06-10 13:36:23 +00003120 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003121 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003122 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003123 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003124#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003125 else if (strcmp(lower, "mbcs") == 0)
3126 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3127 PyUnicode_GET_SIZE(unicode),
3128 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003129#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003130 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003131 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003132 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003133
3134 /* Encode via the codec registry */
3135 v = PyCodec_Encode(unicode, encoding, errors);
3136 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003137 return NULL;
3138
3139 /* The normal path */
3140 if (PyBytes_Check(v))
3141 return v;
3142
3143 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003144 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003145 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003146 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003147
3148 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3149 "encoder %s returned bytearray instead of bytes",
3150 encoding);
3151 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003152 Py_DECREF(v);
3153 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003154 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003155
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003156 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3157 Py_DECREF(v);
3158 return b;
3159 }
3160
3161 PyErr_Format(PyExc_TypeError,
3162 "encoder did not return a bytes object (type=%.400s)",
3163 Py_TYPE(v)->tp_name);
3164 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003165 return NULL;
3166}
3167
Alexander Belopolsky40018472011-02-26 01:02:56 +00003168PyObject *
3169PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003170 const char *encoding,
3171 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003172{
3173 PyObject *v;
3174
3175 if (!PyUnicode_Check(unicode)) {
3176 PyErr_BadArgument();
3177 goto onError;
3178 }
3179
3180 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003181 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003182
3183 /* Encode via the codec registry */
3184 v = PyCodec_Encode(unicode, encoding, errors);
3185 if (v == NULL)
3186 goto onError;
3187 if (!PyUnicode_Check(v)) {
3188 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003189 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003190 Py_TYPE(v)->tp_name);
3191 Py_DECREF(v);
3192 goto onError;
3193 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003194 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003195
Benjamin Peterson29060642009-01-31 22:14:21 +00003196 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003197 return NULL;
3198}
3199
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003200PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003201PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003202 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003203 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3204}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003205
Christian Heimes5894ba72007-11-04 11:43:14 +00003206PyObject*
3207PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3208{
Victor Stinner99b95382011-07-04 14:23:54 +02003209#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003210 return PyUnicode_DecodeMBCS(s, size, NULL);
3211#elif defined(__APPLE__)
3212 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
3213#else
Victor Stinner793b5312011-04-27 00:24:21 +02003214 PyInterpreterState *interp = PyThreadState_GET()->interp;
3215 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3216 cannot use it to encode and decode filenames before it is loaded. Load
3217 the Python codec requires to encode at least its own filename. Use the C
3218 version of the locale codec until the codec registry is initialized and
3219 the Python codec is loaded.
3220
3221 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3222 cannot only rely on it: check also interp->fscodec_initialized for
3223 subinterpreters. */
3224 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003225 return PyUnicode_Decode(s, size,
3226 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003227 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003228 }
3229 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003230 /* locale encoding with surrogateescape */
3231 wchar_t *wchar;
3232 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00003233 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003234
3235 if (s[size] != '\0' || size != strlen(s)) {
3236 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3237 return NULL;
3238 }
3239
Victor Stinner168e1172010-10-16 23:16:16 +00003240 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003241 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00003242 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003243
Victor Stinner168e1172010-10-16 23:16:16 +00003244 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003245 PyMem_Free(wchar);
3246 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003247 }
Victor Stinnerad158722010-10-27 00:25:46 +00003248#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003249}
3250
Martin v. Löwis011e8422009-05-05 04:43:17 +00003251
3252int
3253PyUnicode_FSConverter(PyObject* arg, void* addr)
3254{
3255 PyObject *output = NULL;
3256 Py_ssize_t size;
3257 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003258 if (arg == NULL) {
3259 Py_DECREF(*(PyObject**)addr);
3260 return 1;
3261 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003262 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003263 output = arg;
3264 Py_INCREF(output);
3265 }
3266 else {
3267 arg = PyUnicode_FromObject(arg);
3268 if (!arg)
3269 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003270 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003271 Py_DECREF(arg);
3272 if (!output)
3273 return 0;
3274 if (!PyBytes_Check(output)) {
3275 Py_DECREF(output);
3276 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3277 return 0;
3278 }
3279 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003280 size = PyBytes_GET_SIZE(output);
3281 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003282 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003283 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003284 Py_DECREF(output);
3285 return 0;
3286 }
3287 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003288 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003289}
3290
3291
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003292int
3293PyUnicode_FSDecoder(PyObject* arg, void* addr)
3294{
3295 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003296 if (arg == NULL) {
3297 Py_DECREF(*(PyObject**)addr);
3298 return 1;
3299 }
3300 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003301 if (PyUnicode_READY(arg))
3302 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003303 output = arg;
3304 Py_INCREF(output);
3305 }
3306 else {
3307 arg = PyBytes_FromObject(arg);
3308 if (!arg)
3309 return 0;
3310 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3311 PyBytes_GET_SIZE(arg));
3312 Py_DECREF(arg);
3313 if (!output)
3314 return 0;
3315 if (!PyUnicode_Check(output)) {
3316 Py_DECREF(output);
3317 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3318 return 0;
3319 }
3320 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003321 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003322 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003323 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3324 Py_DECREF(output);
3325 return 0;
3326 }
3327 *(PyObject**)addr = output;
3328 return Py_CLEANUP_SUPPORTED;
3329}
3330
3331
Martin v. Löwis5b222132007-06-10 09:51:05 +00003332char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003333PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003334{
Christian Heimesf3863112007-11-22 07:46:41 +00003335 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003336 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
3337
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003338 if (!PyUnicode_Check(unicode)) {
3339 PyErr_BadArgument();
3340 return NULL;
3341 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003342 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003343 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003344
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003345 if (PyUnicode_UTF8(unicode) == NULL) {
3346 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003347 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3348 if (bytes == NULL)
3349 return NULL;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003350 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3351 if (_PyUnicode_UTF8(u) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003352 Py_DECREF(bytes);
3353 return NULL;
3354 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003355 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
3356 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003357 Py_DECREF(bytes);
3358 }
3359
3360 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003361 *psize = PyUnicode_UTF8_LENGTH(unicode);
3362 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003363}
3364
3365char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003366PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003367{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003368 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3369}
3370
3371#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02003372static int unicode_as_unicode_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003373#endif
3374
3375
3376Py_UNICODE *
3377PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3378{
3379 PyUnicodeObject *u;
3380 const unsigned char *one_byte;
3381#if SIZEOF_WCHAR_T == 4
3382 const Py_UCS2 *two_bytes;
3383#else
3384 const Py_UCS4 *four_bytes;
3385 const Py_UCS4 *ucs4_end;
3386 Py_ssize_t num_surrogates;
3387#endif
3388 wchar_t *w;
3389 wchar_t *wchar_end;
3390
3391 if (!PyUnicode_Check(unicode)) {
3392 PyErr_BadArgument();
3393 return NULL;
3394 }
3395 u = (PyUnicodeObject*)unicode;
3396 if (_PyUnicode_WSTR(u) == NULL) {
3397 /* Non-ASCII compact unicode object */
3398 assert(_PyUnicode_KIND(u) != 0);
3399 assert(PyUnicode_IS_READY(u));
3400
3401#ifdef Py_DEBUG
3402 ++unicode_as_unicode_calls;
3403#endif
3404
3405 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
3406#if SIZEOF_WCHAR_T == 2
3407 four_bytes = PyUnicode_4BYTE_DATA(u);
3408 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
3409 num_surrogates = 0;
3410
3411 for (; four_bytes < ucs4_end; ++four_bytes) {
3412 if (*four_bytes > 0xFFFF)
3413 ++num_surrogates;
3414 }
3415
3416 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
3417 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
3418 if (!_PyUnicode_WSTR(u)) {
3419 PyErr_NoMemory();
3420 return NULL;
3421 }
3422 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
3423
3424 w = _PyUnicode_WSTR(u);
3425 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
3426 four_bytes = PyUnicode_4BYTE_DATA(u);
3427 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3428 if (*four_bytes > 0xFFFF) {
3429 /* encode surrogate pair in this case */
3430 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
3431 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
3432 }
3433 else
3434 *w = *four_bytes;
3435
3436 if (w > wchar_end) {
3437 assert(0 && "Miscalculated string end");
3438 }
3439 }
3440 *w = 0;
3441#else
3442 /* sizeof(wchar_t) == 4 */
3443 Py_FatalError("Impossible unicode object state, wstr and str "
3444 "should share memory already.");
3445 return NULL;
3446#endif
3447 }
3448 else {
3449 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3450 (_PyUnicode_LENGTH(u) + 1));
3451 if (!_PyUnicode_WSTR(u)) {
3452 PyErr_NoMemory();
3453 return NULL;
3454 }
3455 if (!PyUnicode_IS_COMPACT_ASCII(u))
3456 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
3457 w = _PyUnicode_WSTR(u);
3458 wchar_end = w + _PyUnicode_LENGTH(u);
3459
3460 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
3461 one_byte = PyUnicode_1BYTE_DATA(u);
3462 for (; w < wchar_end; ++one_byte, ++w)
3463 *w = *one_byte;
3464 /* null-terminate the wstr */
3465 *w = 0;
3466 }
3467 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
3468#if SIZEOF_WCHAR_T == 4
3469 two_bytes = PyUnicode_2BYTE_DATA(u);
3470 for (; w < wchar_end; ++two_bytes, ++w)
3471 *w = *two_bytes;
3472 /* null-terminate the wstr */
3473 *w = 0;
3474#else
3475 /* sizeof(wchar_t) == 2 */
3476 PyObject_FREE(_PyUnicode_WSTR(u));
3477 _PyUnicode_WSTR(u) = NULL;
3478 Py_FatalError("Impossible unicode object state, wstr "
3479 "and str should share memory already.");
3480 return NULL;
3481#endif
3482 }
3483 else {
3484 assert(0 && "This should never happen.");
3485 }
3486 }
3487 }
3488 if (size != NULL)
3489 *size = PyUnicode_WSTR_LENGTH(u);
3490 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003491}
3492
Alexander Belopolsky40018472011-02-26 01:02:56 +00003493Py_UNICODE *
3494PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003495{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003496 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003497}
3498
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003499
Alexander Belopolsky40018472011-02-26 01:02:56 +00003500Py_ssize_t
3501PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003502{
3503 if (!PyUnicode_Check(unicode)) {
3504 PyErr_BadArgument();
3505 goto onError;
3506 }
3507 return PyUnicode_GET_SIZE(unicode);
3508
Benjamin Peterson29060642009-01-31 22:14:21 +00003509 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003510 return -1;
3511}
3512
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003513Py_ssize_t
3514PyUnicode_GetLength(PyObject *unicode)
3515{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003516 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003517 PyErr_BadArgument();
3518 return -1;
3519 }
3520
3521 return PyUnicode_GET_LENGTH(unicode);
3522}
3523
3524Py_UCS4
3525PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3526{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003527 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3528 PyErr_BadArgument();
3529 return (Py_UCS4)-1;
3530 }
3531 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3532 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003533 return (Py_UCS4)-1;
3534 }
3535 return PyUnicode_READ_CHAR(unicode, index);
3536}
3537
3538int
3539PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3540{
3541 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003542 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003543 return -1;
3544 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003545 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3546 PyErr_SetString(PyExc_IndexError, "string index out of range");
3547 return -1;
3548 }
3549 if (_PyUnicode_Dirty(unicode))
3550 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003551 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3552 index, ch);
3553 return 0;
3554}
3555
/* Return the name of the default encoding, which is always "utf-8".  The
   returned string has static storage and must not be freed or modified. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3561
Victor Stinner554f3f02010-06-16 23:33:54 +00003562/* create or adjust a UnicodeDecodeError */
3563static void
3564make_decode_exception(PyObject **exceptionObject,
3565 const char *encoding,
3566 const char *input, Py_ssize_t length,
3567 Py_ssize_t startpos, Py_ssize_t endpos,
3568 const char *reason)
3569{
3570 if (*exceptionObject == NULL) {
3571 *exceptionObject = PyUnicodeDecodeError_Create(
3572 encoding, input, length, startpos, endpos, reason);
3573 }
3574 else {
3575 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3576 goto onError;
3577 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3578 goto onError;
3579 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3580 goto onError;
3581 }
3582 return;
3583
3584onError:
3585 Py_DECREF(*exceptionObject);
3586 *exceptionObject = NULL;
3587}
3588
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003589/* error handling callback helper:
3590 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003591 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003592 and adjust various state variables.
3593 return 0 on success, -1 on error
3594*/
3595
Alexander Belopolsky40018472011-02-26 01:02:56 +00003596static int
3597unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003598 const char *encoding, const char *reason,
3599 const char **input, const char **inend, Py_ssize_t *startinpos,
3600 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3601 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003602{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003603 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003604
3605 PyObject *restuple = NULL;
3606 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003607 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003608 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003609 Py_ssize_t requiredsize;
3610 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003611 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003612 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003613 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003614 int res = -1;
3615
3616 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003617 *errorHandler = PyCodec_LookupError(errors);
3618 if (*errorHandler == NULL)
3619 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003620 }
3621
Victor Stinner554f3f02010-06-16 23:33:54 +00003622 make_decode_exception(exceptionObject,
3623 encoding,
3624 *input, *inend - *input,
3625 *startinpos, *endinpos,
3626 reason);
3627 if (*exceptionObject == NULL)
3628 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003629
3630 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3631 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003632 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003633 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003634 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003635 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003636 }
3637 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003638 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003639
3640 /* Copy back the bytes variables, which might have been modified by the
3641 callback */
3642 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3643 if (!inputobj)
3644 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003645 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003646 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003647 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003648 *input = PyBytes_AS_STRING(inputobj);
3649 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003650 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003651 /* we can DECREF safely, as the exception has another reference,
3652 so the object won't go away. */
3653 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003654
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003655 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003656 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003657 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003658 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3659 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003660 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003661
3662 /* need more space? (at least enough for what we
3663 have+the replacement+the rest of the string (starting
3664 at the new input position), so we won't have to check space
3665 when there are no errors in the rest of the string) */
3666 repptr = PyUnicode_AS_UNICODE(repunicode);
3667 repsize = PyUnicode_GET_SIZE(repunicode);
3668 requiredsize = *outpos + repsize + insize-newpos;
3669 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003670 if (requiredsize<2*outsize)
3671 requiredsize = 2*outsize;
Victor Stinnerfe226c02011-10-03 03:52:20 +02003672 if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003673 goto onError;
3674 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003675 }
3676 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003677 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003678 Py_UNICODE_COPY(*outptr, repptr, repsize);
3679 *outptr += repsize;
3680 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003681
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003682 /* we made it! */
3683 res = 0;
3684
Benjamin Peterson29060642009-01-31 22:14:21 +00003685 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003686 Py_XDECREF(restuple);
3687 return res;
3688}
3689
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003690/* --- UTF-7 Codec -------------------------------------------------------- */
3691
Antoine Pitrou244651a2009-05-04 18:56:13 +00003692/* See RFC2152 for details. We encode conservatively and decode liberally. */
3693
3694/* Three simple macros defining base-64. */
3695
3696/* Is c a base-64 character? */
3697
/* IS_BASE64: non-zero when c is one of the 64 characters of the base-64
 * alphabet ('+', '/', digits, lower-case and upper-case ASCII letters). */
#define IS_BASE64(c)                        \
    ((c) == '+' || (c) == '/' ||            \
     ((c) >= '0' && (c) <= '9') ||          \
     ((c) >= 'a' && (c) <= 'z') ||          \
     ((c) >= 'A' && (c) <= 'Z'))

/* FROM_BASE64: the 6-bit value of base-64 character c.  The caller must
 * already know that c is a base-64 character; every other input maps
 * to 63. */
#define FROM_BASE64(c)                                      \
    ((c) == '+' ? 62 :                                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :          \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :          \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : 63)

/* TO_BASE64: the base-64 character for the low 6 bits of n. */
#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 63])

/* DECODE_DIRECT: true when a byte found in a UTF-7 string decodes as
 * itself.  Decoding is permissive: every ASCII byte qualifies except
 * '+', which opens a base-64 section. */
#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) < 128)
3724
3725/* The UTF-7 encoder treats ASCII characters differently according to
3726 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
3727 * the above). See RFC2152. This array identifies these different
3728 * sets:
3729 * 0 : "Set D"
3730 * alphanumeric and '(),-./:?
3731 * 1 : "Set O"
3732 * !"#$%&*;<=>@[]^_`{|}
3733 * 2 : "whitespace"
3734 * ht nl cr sp
3735 * 3 : special (must be base64 encoded)
3736 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
3737 */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003738
/* Read-only classification of the 128 ASCII code points used by the
 * UTF-7 encoder: 0 = Set D, 1 = Set O, 2 = whitespace, 3 = special
 * (must be base-64 encoded).  Declared const because the table is only
 * ever indexed, never written. */
static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
3758
/* ENCODE_DIRECT: true when character c should be emitted as itself by
 * the UTF-7 encoder.  c must be a non-NUL ASCII character whose
 * utf7_category is 0, or 2 when directWS is true (whitespace is written
 * directly), or 1 when directO is true (Set O is written directly).
 * RFC2152 leaves those two choices to the application, hence the flags.
 *
 * directO and directWS are parenthesized in the expansion so that an
 * argument containing a lower-precedence operator (e.g. `a || b`) is
 * evaluated as a unit.  c is evaluated several times; pass an expression
 * without side effects. */
#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003770
Alexander Belopolsky40018472011-02-26 01:02:56 +00003771PyObject *
3772PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003773 Py_ssize_t size,
3774 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003775{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003776 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3777}
3778
Antoine Pitrou244651a2009-05-04 18:56:13 +00003779/* The decoder. The only state we preserve is our read position,
3780 * i.e. how many characters we have consumed. So if we end in the
3781 * middle of a shift sequence we have to back off the read position
3782 * and the output to the beginning of the sequence, otherwise we lose
3783 * all the shift state (seen bits, number of bits seen, high
3784 * surrogate). */
3785
Alexander Belopolsky40018472011-02-26 01:02:56 +00003786PyObject *
3787PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003788 Py_ssize_t size,
3789 const char *errors,
3790 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003791{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003792 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003793 Py_ssize_t startinpos;
3794 Py_ssize_t endinpos;
3795 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003796 const char *e;
3797 PyUnicodeObject *unicode;
3798 Py_UNICODE *p;
3799 const char *errmsg = "";
3800 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003801 Py_UNICODE *shiftOutStart;
3802 unsigned int base64bits = 0;
3803 unsigned long base64buffer = 0;
3804 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003805 PyObject *errorHandler = NULL;
3806 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003807
3808 unicode = _PyUnicode_New(size);
3809 if (!unicode)
3810 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003811 if (size == 0) {
3812 if (consumed)
3813 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003814 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003815 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003816
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003817 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003818 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003819 e = s + size;
3820
3821 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003822 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003823 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003824 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003825
Antoine Pitrou244651a2009-05-04 18:56:13 +00003826 if (inShift) { /* in a base-64 section */
3827 if (IS_BASE64(ch)) { /* consume a base-64 character */
3828 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3829 base64bits += 6;
3830 s++;
3831 if (base64bits >= 16) {
3832 /* we have enough bits for a UTF-16 value */
3833 Py_UNICODE outCh = (Py_UNICODE)
3834 (base64buffer >> (base64bits-16));
3835 base64bits -= 16;
3836 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3837 if (surrogate) {
3838 /* expecting a second surrogate */
3839 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3840#ifdef Py_UNICODE_WIDE
3841 *p++ = (((surrogate & 0x3FF)<<10)
3842 | (outCh & 0x3FF)) + 0x10000;
3843#else
3844 *p++ = surrogate;
3845 *p++ = outCh;
3846#endif
3847 surrogate = 0;
3848 }
3849 else {
3850 surrogate = 0;
3851 errmsg = "second surrogate missing";
3852 goto utf7Error;
3853 }
3854 }
3855 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3856 /* first surrogate */
3857 surrogate = outCh;
3858 }
3859 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3860 errmsg = "unexpected second surrogate";
3861 goto utf7Error;
3862 }
3863 else {
3864 *p++ = outCh;
3865 }
3866 }
3867 }
3868 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003869 inShift = 0;
3870 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003871 if (surrogate) {
3872 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003873 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003874 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003875 if (base64bits > 0) { /* left-over bits */
3876 if (base64bits >= 6) {
3877 /* We've seen at least one base-64 character */
3878 errmsg = "partial character in shift sequence";
3879 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003880 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003881 else {
3882 /* Some bits remain; they should be zero */
3883 if (base64buffer != 0) {
3884 errmsg = "non-zero padding bits in shift sequence";
3885 goto utf7Error;
3886 }
3887 }
3888 }
3889 if (ch != '-') {
3890 /* '-' is absorbed; other terminating
3891 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003892 *p++ = ch;
3893 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003894 }
3895 }
3896 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003897 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003898 s++; /* consume '+' */
3899 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003900 s++;
3901 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003902 }
3903 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003904 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003905 shiftOutStart = p;
3906 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003907 }
3908 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003909 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003910 *p++ = ch;
3911 s++;
3912 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003913 else {
3914 startinpos = s-starts;
3915 s++;
3916 errmsg = "unexpected special character";
3917 goto utf7Error;
3918 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003919 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003920utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003921 outpos = p-PyUnicode_AS_UNICODE(unicode);
3922 endinpos = s-starts;
3923 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003924 errors, &errorHandler,
3925 "utf7", errmsg,
3926 &starts, &e, &startinpos, &endinpos, &exc, &s,
3927 &unicode, &outpos, &p))
3928 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003929 }
3930
Antoine Pitrou244651a2009-05-04 18:56:13 +00003931 /* end of string */
3932
3933 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3934 /* if we're in an inconsistent state, that's an error */
3935 if (surrogate ||
3936 (base64bits >= 6) ||
3937 (base64bits > 0 && base64buffer != 0)) {
3938 outpos = p-PyUnicode_AS_UNICODE(unicode);
3939 endinpos = size;
3940 if (unicode_decode_call_errorhandler(
3941 errors, &errorHandler,
3942 "utf7", "unterminated shift sequence",
3943 &starts, &e, &startinpos, &endinpos, &exc, &s,
3944 &unicode, &outpos, &p))
3945 goto onError;
3946 if (s < e)
3947 goto restart;
3948 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003949 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003950
3951 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003952 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003953 if (inShift) {
3954 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003955 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003956 }
3957 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003958 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003959 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003960 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003961
Victor Stinnerfe226c02011-10-03 03:52:20 +02003962 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003963 goto onError;
3964
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003965 Py_XDECREF(errorHandler);
3966 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02003967#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02003968 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003969 Py_DECREF(unicode);
3970 return NULL;
3971 }
Victor Stinner17efeed2011-10-04 20:05:46 +02003972#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02003973 assert(_PyUnicode_CheckConsistency(unicode, 1));
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003974 return (PyObject *)unicode;
3975
Benjamin Peterson29060642009-01-31 22:14:21 +00003976 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003977 Py_XDECREF(errorHandler);
3978 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003979 Py_DECREF(unicode);
3980 return NULL;
3981}
3982
3983
Alexander Belopolsky40018472011-02-26 01:02:56 +00003984PyObject *
3985PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003986 Py_ssize_t size,
3987 int base64SetO,
3988 int base64WhiteSpace,
3989 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003990{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003991 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003992 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003993 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003994 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003995 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003996 unsigned int base64bits = 0;
3997 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003998 char * out;
3999 char * start;
4000
4001 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004002 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004003
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00004004 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004005 return PyErr_NoMemory();
4006
Antoine Pitrou244651a2009-05-04 18:56:13 +00004007 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004008 if (v == NULL)
4009 return NULL;
4010
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004011 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004012 for (;i < size; ++i) {
4013 Py_UNICODE ch = s[i];
4014
Antoine Pitrou244651a2009-05-04 18:56:13 +00004015 if (inShift) {
4016 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4017 /* shifting out */
4018 if (base64bits) { /* output remaining bits */
4019 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4020 base64buffer = 0;
4021 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004022 }
4023 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004024 /* Characters not in the BASE64 set implicitly unshift the sequence
4025 so no '-' is required, except if the character is itself a '-' */
4026 if (IS_BASE64(ch) || ch == '-') {
4027 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004028 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004029 *out++ = (char) ch;
4030 }
4031 else {
4032 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004033 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004034 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004035 else { /* not in a shift sequence */
4036 if (ch == '+') {
4037 *out++ = '+';
4038 *out++ = '-';
4039 }
4040 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4041 *out++ = (char) ch;
4042 }
4043 else {
4044 *out++ = '+';
4045 inShift = 1;
4046 goto encode_char;
4047 }
4048 }
4049 continue;
4050encode_char:
4051#ifdef Py_UNICODE_WIDE
4052 if (ch >= 0x10000) {
4053 /* code first surrogate */
4054 base64bits += 16;
4055 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4056 while (base64bits >= 6) {
4057 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4058 base64bits -= 6;
4059 }
4060 /* prepare second surrogate */
4061 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
4062 }
4063#endif
4064 base64bits += 16;
4065 base64buffer = (base64buffer << 16) | ch;
4066 while (base64bits >= 6) {
4067 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4068 base64bits -= 6;
4069 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004070 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004071 if (base64bits)
4072 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4073 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004074 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004075 if (_PyBytes_Resize(&v, out - start) < 0)
4076 return NULL;
4077 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004078}
4079
Antoine Pitrou244651a2009-05-04 18:56:13 +00004080#undef IS_BASE64
4081#undef FROM_BASE64
4082#undef TO_BASE64
4083#undef DECODE_DIRECT
4084#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004085
Guido van Rossumd57fd912000-03-10 22:53:23 +00004086/* --- UTF-8 Codec -------------------------------------------------------- */
4087
/* Read-only table mapping a UTF-8 lead byte to the total length of the
   sequence it starts.  Declared const because it is only ever indexed. */
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
4109
Alexander Belopolsky40018472011-02-26 01:02:56 +00004110PyObject *
4111PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004112 Py_ssize_t size,
4113 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004114{
Walter Dörwald69652032004-09-07 20:24:22 +00004115 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4116}
4117
Antoine Pitrouab868312009-01-10 15:40:25 +00004118/* Mask to check or force alignment of a pointer to C 'long' boundaries */
4119#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
4120
4121/* Mask to quickly check whether a C 'long' contains a
4122 non-ASCII, UTF8-encoded char. */
4123#if (SIZEOF_LONG == 8)
4124# define ASCII_CHAR_MASK 0x8080808080808080L
4125#elif (SIZEOF_LONG == 4)
4126# define ASCII_CHAR_MASK 0x80808080L
4127#else
4128# error C 'long' size should be either 4 or 8!
4129#endif
4130
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004131/* Scans a UTF-8 string and returns the maximum character to be expected,
4132 the size of the decoded unicode string and if any major errors were
4133 encountered.
4134
4135 This function does check basic UTF-8 sanity, it does however NOT CHECK
4136 if the string contains surrogates, and if all continuation bytes are
4137 within the correct ranges, these checks are performed in
4138 PyUnicode_DecodeUTF8Stateful.
4139
4140 If it sets has_errors to 1, it means the value of unicode_size and max_char
4141 will be bogus and you should not rely on useful information in them.
4142 */
4143static Py_UCS4
4144utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
4145 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
4146 int *has_errors)
4147{
4148 Py_ssize_t n;
4149 Py_ssize_t char_count = 0;
4150 Py_UCS4 max_char = 127, new_max;
4151 Py_UCS4 upper_bound;
4152 const unsigned char *p = (const unsigned char *)s;
4153 const unsigned char *end = p + string_size;
4154 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
4155 int err = 0;
4156
4157 for (; p < end && !err; ++p, ++char_count) {
4158 /* Only check value if it's not a ASCII char... */
4159 if (*p < 0x80) {
4160 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4161 an explanation. */
4162 if (!((size_t) p & LONG_PTR_MASK)) {
4163 /* Help register allocation */
4164 register const unsigned char *_p = p;
4165 while (_p < aligned_end) {
4166 unsigned long value = *(unsigned long *) _p;
4167 if (value & ASCII_CHAR_MASK)
4168 break;
4169 _p += SIZEOF_LONG;
4170 char_count += SIZEOF_LONG;
4171 }
4172 p = _p;
4173 if (p == end)
4174 break;
4175 }
4176 }
4177 if (*p >= 0x80) {
4178 n = utf8_code_length[*p];
4179 new_max = max_char;
4180 switch (n) {
4181 /* invalid start byte */
4182 case 0:
4183 err = 1;
4184 break;
4185 case 2:
4186 /* Code points between 0x00FF and 0x07FF inclusive.
4187 Approximate the upper bound of the code point,
4188 if this flips over 255 we can be sure it will be more
4189 than 255 and the string will need 2 bytes per code coint,
4190 if it stays under or equal to 255, we can be sure 1 byte
4191 is enough.
4192 ((*p & 0b00011111) << 6) | 0b00111111 */
4193 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
4194 if (max_char < upper_bound)
4195 new_max = upper_bound;
4196 /* Ensure we track at least that we left ASCII space. */
4197 if (new_max < 128)
4198 new_max = 128;
4199 break;
4200 case 3:
4201 /* Between 0x0FFF and 0xFFFF inclusive, so values are
4202 always > 255 and <= 65535 and will always need 2 bytes. */
4203 if (max_char < 65535)
4204 new_max = 65535;
4205 break;
4206 case 4:
4207 /* Code point will be above 0xFFFF for sure in this case. */
4208 new_max = 65537;
4209 break;
4210 /* Internal error, this should be caught by the first if */
4211 case 1:
4212 default:
4213 assert(0 && "Impossible case in utf8_max_char_and_size");
4214 err = 1;
4215 }
4216 /* Instead of number of overall bytes for this code point,
Georg Brandl7597add2011-10-05 16:36:47 +02004217 n contains the number of following bytes: */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004218 --n;
4219 /* Check if the follow up chars are all valid continuation bytes */
4220 if (n >= 1) {
4221 const unsigned char *cont;
4222 if ((p + n) >= end) {
4223 if (consumed == 0)
4224 /* incomplete data, non-incremental decoding */
4225 err = 1;
4226 break;
4227 }
4228 for (cont = p + 1; cont < (p + n); ++cont) {
4229 if ((*cont & 0xc0) != 0x80) {
4230 err = 1;
4231 break;
4232 }
4233 }
4234 p += n;
4235 }
4236 else
4237 err = 1;
4238 max_char = new_max;
4239 }
4240 }
4241
4242 if (unicode_size)
4243 *unicode_size = char_count;
4244 if (has_errors)
4245 *has_errors = err;
4246 return max_char;
4247}
4248
/* Store `value` at position `index` of `buf`, where `kind` selects the
   element type of the buffer.  Like PyUnicode_WRITE, but it additionally
   accepts PyUnicode_WCHAR_KIND so that it can write into the wstr field
   of the legacy unicode representation.  `kind` is evaluated once; any
   kind other than the three tested ones is written as Py_UCS4. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value)                 \
    do {                                                                \
        const int wkind_ = (kind);                                      \
        if (wkind_ == PyUnicode_WCHAR_KIND) {                           \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);       \
        }                                                               \
        else if (wkind_ == PyUnicode_1BYTE_KIND) {                      \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
        }                                                               \
        else if (wkind_ == PyUnicode_2BYTE_KIND) {                      \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value);             \
        }                                                               \
        else {                                                          \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value);             \
        }                                                               \
    } while (0)
4263
Alexander Belopolsky40018472011-02-26 01:02:56 +00004264PyObject *
4265PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004266 Py_ssize_t size,
4267 const char *errors,
4268 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004269{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004270 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004271 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004272 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004273 Py_ssize_t startinpos;
4274 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00004275 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004276 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004277 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004278 PyObject *errorHandler = NULL;
4279 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004280 Py_UCS4 maxchar = 0;
4281 Py_ssize_t unicode_size;
4282 Py_ssize_t i;
4283 int kind;
4284 void *data;
4285 int has_errors;
4286 Py_UNICODE *error_outptr;
4287#if SIZEOF_WCHAR_T == 2
4288 Py_ssize_t wchar_offset = 0;
4289#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004290
Walter Dörwald69652032004-09-07 20:24:22 +00004291 if (size == 0) {
4292 if (consumed)
4293 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004294 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00004295 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004296 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
4297 consumed, &has_errors);
4298 if (has_errors) {
4299 unicode = _PyUnicode_New(size);
4300 if (!unicode)
4301 return NULL;
4302 kind = PyUnicode_WCHAR_KIND;
4303 data = PyUnicode_AS_UNICODE(unicode);
4304 assert(data != NULL);
4305 }
4306 else {
4307 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
4308 if (!unicode)
4309 return NULL;
4310 /* When the string is ASCII only, just use memcpy and return.
4311 unicode_size may be != size if there is an incomplete UTF-8
4312 sequence at the end of the ASCII block. */
4313 if (maxchar < 128 && size == unicode_size) {
4314 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
4315 return (PyObject *)unicode;
4316 }
4317 kind = PyUnicode_KIND(unicode);
4318 data = PyUnicode_DATA(unicode);
4319 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004320 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004321 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004322 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00004323 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004324
4325 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004326 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004327
4328 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004329 /* Fast path for runs of ASCII characters. Given that common UTF-8
4330 input will consist of an overwhelming majority of ASCII
4331 characters, we try to optimize for this case by checking
4332 as many characters as a C 'long' can contain.
4333 First, check if we can do an aligned read, as most CPUs have
4334 a penalty for unaligned reads.
4335 */
4336 if (!((size_t) s & LONG_PTR_MASK)) {
4337 /* Help register allocation */
4338 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004339 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004340 while (_s < aligned_end) {
4341 /* Read a whole long at a time (either 4 or 8 bytes),
4342 and do a fast unrolled copy if it only contains ASCII
4343 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004344 unsigned long value = *(unsigned long *) _s;
4345 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004346 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004347 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
4348 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
4349 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
4350 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004351#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004352 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
4353 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
4354 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
4355 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004356#endif
4357 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004358 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004359 }
4360 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004361 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004362 if (s == e)
4363 break;
4364 ch = (unsigned char)*s;
4365 }
4366 }
4367
4368 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004369 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004370 s++;
4371 continue;
4372 }
4373
4374 n = utf8_code_length[ch];
4375
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004376 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004377 if (consumed)
4378 break;
4379 else {
4380 errmsg = "unexpected end of data";
4381 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004382 endinpos = startinpos+1;
4383 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4384 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004385 goto utf8Error;
4386 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004387 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004388
4389 switch (n) {
4390
4391 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004392 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004393 startinpos = s-starts;
4394 endinpos = startinpos+1;
4395 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004396
4397 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004398 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004399 startinpos = s-starts;
4400 endinpos = startinpos+1;
4401 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004402
4403 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004404 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004405 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004406 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004407 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004408 goto utf8Error;
4409 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004410 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004411 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004412 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004413 break;
4414
4415 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004416 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4417 will result in surrogates in range d800-dfff. Surrogates are
4418 not valid UTF-8 so they are rejected.
4419 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4420 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004421 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004422 (s[2] & 0xc0) != 0x80 ||
4423 ((unsigned char)s[0] == 0xE0 &&
4424 (unsigned char)s[1] < 0xA0) ||
4425 ((unsigned char)s[0] == 0xED &&
4426 (unsigned char)s[1] > 0x9F)) {
4427 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004428 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004429 endinpos = startinpos + 1;
4430
4431 /* if s[1] first two bits are 1 and 0, then the invalid
4432 continuation byte is s[2], so increment endinpos by 1,
4433 if not, s[1] is invalid and endinpos doesn't need to
4434 be incremented. */
4435 if ((s[1] & 0xC0) == 0x80)
4436 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004437 goto utf8Error;
4438 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004439 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004440 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004441 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004442 break;
4443
4444 case 4:
4445 if ((s[1] & 0xc0) != 0x80 ||
4446 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004447 (s[3] & 0xc0) != 0x80 ||
4448 ((unsigned char)s[0] == 0xF0 &&
4449 (unsigned char)s[1] < 0x90) ||
4450 ((unsigned char)s[0] == 0xF4 &&
4451 (unsigned char)s[1] > 0x8F)) {
4452 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004453 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004454 endinpos = startinpos + 1;
4455 if ((s[1] & 0xC0) == 0x80) {
4456 endinpos++;
4457 if ((s[2] & 0xC0) == 0x80)
4458 endinpos++;
4459 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004460 goto utf8Error;
4461 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004462 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004463 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4464 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4465
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004466 /* If the string is flexible or we have native UCS-4, write
4467 directly.. */
4468 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
4469 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00004470
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004471 else {
4472 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00004473
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004474 /* translate from 10000..10FFFF to 0..FFFF */
4475 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00004476
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004477 /* high surrogate = top 10 bits added to D800 */
4478 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4479 (Py_UNICODE)(0xD800 + (ch >> 10)));
4480
4481 /* low surrogate = bottom 10 bits added to DC00 */
4482 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4483 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
4484 }
4485#if SIZEOF_WCHAR_T == 2
4486 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004487#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004488 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004489 }
4490 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004491 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004492
Benjamin Peterson29060642009-01-31 22:14:21 +00004493 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004494 /* If this is not yet a resizable string, make it one.. */
4495 if (kind != PyUnicode_WCHAR_KIND) {
4496 const Py_UNICODE *u;
4497 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
4498 if (!new_unicode)
4499 goto onError;
4500 u = PyUnicode_AsUnicode((PyObject *)unicode);
4501 if (!u)
4502 goto onError;
4503#if SIZEOF_WCHAR_T == 2
4504 i += wchar_offset;
4505#endif
4506 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
4507 Py_DECREF(unicode);
4508 unicode = new_unicode;
4509 kind = 0;
4510 data = PyUnicode_AS_UNICODE(new_unicode);
4511 assert(data != NULL);
4512 }
4513 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00004514 if (unicode_decode_call_errorhandler(
4515 errors, &errorHandler,
4516 "utf8", errmsg,
4517 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004518 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00004519 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004520 /* Update data because unicode_decode_call_errorhandler might have
4521 re-created or resized the unicode object. */
4522 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004523 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004524 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004525 /* Ensure the unicode_size calculation above was correct: */
4526 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
4527
Walter Dörwald69652032004-09-07 20:24:22 +00004528 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004529 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004530
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004531 /* Adjust length and ready string when it contained errors and
4532 is of the old resizable kind. */
4533 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004534 if (PyUnicode_Resize((PyObject**)&unicode, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004535 goto onError;
4536 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004537
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004538 Py_XDECREF(errorHandler);
4539 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02004540#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004541 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004542 Py_DECREF(unicode);
4543 return NULL;
4544 }
Victor Stinner17efeed2011-10-04 20:05:46 +02004545#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004546 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004547 return (PyObject *)unicode;
4548
Benjamin Peterson29060642009-01-31 22:14:21 +00004549 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004550 Py_XDECREF(errorHandler);
4551 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004552 Py_DECREF(unicode);
4553 return NULL;
4554}
4555
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004556#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00004557
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004558#ifdef __APPLE__
4559
4560/* Simplified UTF-8 decoder using surrogateescape error handler,
4561 used to decode the command line arguments on Mac OS X. */
4562
4563wchar_t*
4564_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4565{
4566 int n;
4567 const char *e;
4568 wchar_t *unicode, *p;
4569
4570 /* Note: size will always be longer than the resulting Unicode
4571 character count */
4572 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4573 PyErr_NoMemory();
4574 return NULL;
4575 }
4576 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4577 if (!unicode)
4578 return NULL;
4579
4580 /* Unpack UTF-8 encoded data */
4581 p = unicode;
4582 e = s + size;
4583 while (s < e) {
4584 Py_UCS4 ch = (unsigned char)*s;
4585
4586 if (ch < 0x80) {
4587 *p++ = (wchar_t)ch;
4588 s++;
4589 continue;
4590 }
4591
4592 n = utf8_code_length[ch];
4593 if (s + n > e) {
4594 goto surrogateescape;
4595 }
4596
4597 switch (n) {
4598 case 0:
4599 case 1:
4600 goto surrogateescape;
4601
4602 case 2:
4603 if ((s[1] & 0xc0) != 0x80)
4604 goto surrogateescape;
4605 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4606 assert ((ch > 0x007F) && (ch <= 0x07FF));
4607 *p++ = (wchar_t)ch;
4608 break;
4609
4610 case 3:
4611 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4612 will result in surrogates in range d800-dfff. Surrogates are
4613 not valid UTF-8 so they are rejected.
4614 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4615 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4616 if ((s[1] & 0xc0) != 0x80 ||
4617 (s[2] & 0xc0) != 0x80 ||
4618 ((unsigned char)s[0] == 0xE0 &&
4619 (unsigned char)s[1] < 0xA0) ||
4620 ((unsigned char)s[0] == 0xED &&
4621 (unsigned char)s[1] > 0x9F)) {
4622
4623 goto surrogateescape;
4624 }
4625 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4626 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004627 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004628 break;
4629
4630 case 4:
4631 if ((s[1] & 0xc0) != 0x80 ||
4632 (s[2] & 0xc0) != 0x80 ||
4633 (s[3] & 0xc0) != 0x80 ||
4634 ((unsigned char)s[0] == 0xF0 &&
4635 (unsigned char)s[1] < 0x90) ||
4636 ((unsigned char)s[0] == 0xF4 &&
4637 (unsigned char)s[1] > 0x8F)) {
4638 goto surrogateescape;
4639 }
4640 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4641 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4642 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4643
4644#if SIZEOF_WCHAR_T == 4
4645 *p++ = (wchar_t)ch;
4646#else
4647 /* compute and append the two surrogates: */
4648
4649 /* translate from 10000..10FFFF to 0..FFFF */
4650 ch -= 0x10000;
4651
4652 /* high surrogate = top 10 bits added to D800 */
4653 *p++ = (wchar_t)(0xD800 + (ch >> 10));
4654
4655 /* low surrogate = bottom 10 bits added to DC00 */
4656 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
4657#endif
4658 break;
4659 }
4660 s += n;
4661 continue;
4662
4663 surrogateescape:
4664 *p++ = 0xDC00 + ch;
4665 s++;
4666 }
4667 *p = L'\0';
4668 return unicode;
4669}
4670
4671#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004672
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004673/* Primary internal function which creates utf8 encoded bytes objects.
4674
4675 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004676 and allocate exactly as much space needed at the end. Else allocate the
4677 maximum possible needed (4 result bytes per Unicode character), and return
4678 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004679*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004680PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004681_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004682{
Tim Peters602f7402002-04-27 18:03:26 +00004683#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004684
Guido van Rossum98297ee2007-11-06 21:34:58 +00004685 Py_ssize_t i; /* index into s of next input byte */
4686 PyObject *result; /* result string object */
4687 char *p; /* next free byte in output buffer */
4688 Py_ssize_t nallocated; /* number of result bytes allocated */
4689 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004690 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004691 PyObject *errorHandler = NULL;
4692 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004693 int kind;
4694 void *data;
4695 Py_ssize_t size;
4696 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4697#if SIZEOF_WCHAR_T == 2
4698 Py_ssize_t wchar_offset = 0;
4699#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004700
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004701 if (!PyUnicode_Check(unicode)) {
4702 PyErr_BadArgument();
4703 return NULL;
4704 }
4705
4706 if (PyUnicode_READY(unicode) == -1)
4707 return NULL;
4708
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004709 if (PyUnicode_UTF8(unicode))
4710 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4711 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004712
4713 kind = PyUnicode_KIND(unicode);
4714 data = PyUnicode_DATA(unicode);
4715 size = PyUnicode_GET_LENGTH(unicode);
4716
Tim Peters602f7402002-04-27 18:03:26 +00004717 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004718
Tim Peters602f7402002-04-27 18:03:26 +00004719 if (size <= MAX_SHORT_UNICHARS) {
4720 /* Write into the stack buffer; nallocated can't overflow.
4721 * At the end, we'll allocate exactly as much heap space as it
4722 * turns out we need.
4723 */
4724 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004725 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004726 p = stackbuf;
4727 }
4728 else {
4729 /* Overallocate on the heap, and give the excess back at the end. */
4730 nallocated = size * 4;
4731 if (nallocated / 4 != size) /* overflow! */
4732 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004733 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004734 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004735 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004736 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004737 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004738
Tim Peters602f7402002-04-27 18:03:26 +00004739 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004740 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004741
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004742 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004743 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004744 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004745
Guido van Rossumd57fd912000-03-10 22:53:23 +00004746 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004747 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004748 *p++ = (char)(0xc0 | (ch >> 6));
4749 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004750 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004751 Py_ssize_t newpos;
4752 PyObject *rep;
4753 Py_ssize_t repsize, k, startpos;
4754 startpos = i-1;
4755#if SIZEOF_WCHAR_T == 2
4756 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004757#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004758 rep = unicode_encode_call_errorhandler(
4759 errors, &errorHandler, "utf-8", "surrogates not allowed",
4760 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4761 &exc, startpos, startpos+1, &newpos);
4762 if (!rep)
4763 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004764
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004765 if (PyBytes_Check(rep))
4766 repsize = PyBytes_GET_SIZE(rep);
4767 else
4768 repsize = PyUnicode_GET_SIZE(rep);
4769
4770 if (repsize > 4) {
4771 Py_ssize_t offset;
4772
4773 if (result == NULL)
4774 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004775 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004776 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004777
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004778 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4779 /* integer overflow */
4780 PyErr_NoMemory();
4781 goto error;
4782 }
4783 nallocated += repsize - 4;
4784 if (result != NULL) {
4785 if (_PyBytes_Resize(&result, nallocated) < 0)
4786 goto error;
4787 } else {
4788 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004789 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004790 goto error;
4791 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4792 }
4793 p = PyBytes_AS_STRING(result) + offset;
4794 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004795
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004796 if (PyBytes_Check(rep)) {
4797 char *prep = PyBytes_AS_STRING(rep);
4798 for(k = repsize; k > 0; k--)
4799 *p++ = *prep++;
4800 } else /* rep is unicode */ {
4801 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4802 Py_UNICODE c;
4803
4804 for(k=0; k<repsize; k++) {
4805 c = prep[k];
4806 if (0x80 <= c) {
4807 raise_encode_exception(&exc, "utf-8",
4808 PyUnicode_AS_UNICODE(unicode),
4809 size, i-1, i,
4810 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004811 goto error;
4812 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004813 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004814 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004815 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004816 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004817 } else if (ch < 0x10000) {
4818 *p++ = (char)(0xe0 | (ch >> 12));
4819 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4820 *p++ = (char)(0x80 | (ch & 0x3f));
4821 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004822 /* Encode UCS4 Unicode ordinals */
4823 *p++ = (char)(0xf0 | (ch >> 18));
4824 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4825 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4826 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004827#if SIZEOF_WCHAR_T == 2
4828 wchar_offset++;
4829#endif
Tim Peters602f7402002-04-27 18:03:26 +00004830 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004831 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004832
Guido van Rossum98297ee2007-11-06 21:34:58 +00004833 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004834 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004835 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004836 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004837 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004838 }
4839 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004840 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004841 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004842 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004843 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004844 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004845
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004846 Py_XDECREF(errorHandler);
4847 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004848 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004849 error:
4850 Py_XDECREF(errorHandler);
4851 Py_XDECREF(exc);
4852 Py_XDECREF(result);
4853 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004854
Tim Peters602f7402002-04-27 18:03:26 +00004855#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004856}
4857
Alexander Belopolsky40018472011-02-26 01:02:56 +00004858PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004859PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4860 Py_ssize_t size,
4861 const char *errors)
4862{
4863 PyObject *v, *unicode;
4864
4865 unicode = PyUnicode_FromUnicode(s, size);
4866 if (unicode == NULL)
4867 return NULL;
4868 v = _PyUnicode_AsUTF8String(unicode, errors);
4869 Py_DECREF(unicode);
4870 return v;
4871}
4872
4873PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004874PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004875{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004876 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004877}
4878
Walter Dörwald41980ca2007-08-16 21:55:45 +00004879/* --- UTF-32 Codec ------------------------------------------------------- */
4880
4881PyObject *
4882PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004883 Py_ssize_t size,
4884 const char *errors,
4885 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004886{
4887 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4888}
4889
4890PyObject *
4891PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004892 Py_ssize_t size,
4893 const char *errors,
4894 int *byteorder,
4895 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004896{
4897 const char *starts = s;
4898 Py_ssize_t startinpos;
4899 Py_ssize_t endinpos;
4900 Py_ssize_t outpos;
4901 PyUnicodeObject *unicode;
4902 Py_UNICODE *p;
4903#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004904 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004905 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004906#else
4907 const int pairs = 0;
4908#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004909 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004910 int bo = 0; /* assume native ordering by default */
4911 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004912 /* Offsets from q for retrieving bytes in the right order. */
4913#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4914 int iorder[] = {0, 1, 2, 3};
4915#else
4916 int iorder[] = {3, 2, 1, 0};
4917#endif
4918 PyObject *errorHandler = NULL;
4919 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004920
Walter Dörwald41980ca2007-08-16 21:55:45 +00004921 q = (unsigned char *)s;
4922 e = q + size;
4923
4924 if (byteorder)
4925 bo = *byteorder;
4926
4927 /* Check for BOM marks (U+FEFF) in the input and adjust current
4928 byte order setting accordingly. In native mode, the leading BOM
4929 mark is skipped, in all other modes, it is copied to the output
4930 stream as-is (giving a ZWNBSP character). */
4931 if (bo == 0) {
4932 if (size >= 4) {
4933 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004934 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004935#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004936 if (bom == 0x0000FEFF) {
4937 q += 4;
4938 bo = -1;
4939 }
4940 else if (bom == 0xFFFE0000) {
4941 q += 4;
4942 bo = 1;
4943 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004944#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004945 if (bom == 0x0000FEFF) {
4946 q += 4;
4947 bo = 1;
4948 }
4949 else if (bom == 0xFFFE0000) {
4950 q += 4;
4951 bo = -1;
4952 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004953#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004954 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004955 }
4956
4957 if (bo == -1) {
4958 /* force LE */
4959 iorder[0] = 0;
4960 iorder[1] = 1;
4961 iorder[2] = 2;
4962 iorder[3] = 3;
4963 }
4964 else if (bo == 1) {
4965 /* force BE */
4966 iorder[0] = 3;
4967 iorder[1] = 2;
4968 iorder[2] = 1;
4969 iorder[3] = 0;
4970 }
4971
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004972 /* On narrow builds we split characters outside the BMP into two
4973 codepoints => count how much extra space we need. */
4974#ifndef Py_UNICODE_WIDE
4975 for (qq = q; qq < e; qq += 4)
4976 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4977 pairs++;
4978#endif
4979
4980 /* This might be one to much, because of a BOM */
4981 unicode = _PyUnicode_New((size+3)/4+pairs);
4982 if (!unicode)
4983 return NULL;
4984 if (size == 0)
4985 return (PyObject *)unicode;
4986
4987 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004988 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004989
Walter Dörwald41980ca2007-08-16 21:55:45 +00004990 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004991 Py_UCS4 ch;
4992 /* remaining bytes at the end? (size should be divisible by 4) */
4993 if (e-q<4) {
4994 if (consumed)
4995 break;
4996 errmsg = "truncated data";
4997 startinpos = ((const char *)q)-starts;
4998 endinpos = ((const char *)e)-starts;
4999 goto utf32Error;
5000 /* The remaining input chars are ignored if the callback
5001 chooses to skip the input */
5002 }
5003 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5004 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005005
Benjamin Peterson29060642009-01-31 22:14:21 +00005006 if (ch >= 0x110000)
5007 {
5008 errmsg = "codepoint not in range(0x110000)";
5009 startinpos = ((const char *)q)-starts;
5010 endinpos = startinpos+4;
5011 goto utf32Error;
5012 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005013#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005014 if (ch >= 0x10000)
5015 {
5016 *p++ = 0xD800 | ((ch-0x10000) >> 10);
5017 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
5018 }
5019 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00005020#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005021 *p++ = ch;
5022 q += 4;
5023 continue;
5024 utf32Error:
5025 outpos = p-PyUnicode_AS_UNICODE(unicode);
5026 if (unicode_decode_call_errorhandler(
5027 errors, &errorHandler,
5028 "utf32", errmsg,
5029 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
5030 &unicode, &outpos, &p))
5031 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005032 }
5033
5034 if (byteorder)
5035 *byteorder = bo;
5036
5037 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005038 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005039
5040 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02005041 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005042 goto onError;
5043
5044 Py_XDECREF(errorHandler);
5045 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005046#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005047 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005048 Py_DECREF(unicode);
5049 return NULL;
5050 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005051#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005052 assert(_PyUnicode_CheckConsistency(unicode, 1));
Walter Dörwald41980ca2007-08-16 21:55:45 +00005053 return (PyObject *)unicode;
5054
Benjamin Peterson29060642009-01-31 22:14:21 +00005055 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005056 Py_DECREF(unicode);
5057 Py_XDECREF(errorHandler);
5058 Py_XDECREF(exc);
5059 return NULL;
5060}
5061
5062PyObject *
5063PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005064 Py_ssize_t size,
5065 const char *errors,
5066 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005067{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005068 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005069 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005070 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005071#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005072 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005073#else
5074 const int pairs = 0;
5075#endif
5076 /* Offsets from p for storing byte pairs in the right order. */
5077#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5078 int iorder[] = {0, 1, 2, 3};
5079#else
5080 int iorder[] = {3, 2, 1, 0};
5081#endif
5082
Benjamin Peterson29060642009-01-31 22:14:21 +00005083#define STORECHAR(CH) \
5084 do { \
5085 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5086 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5087 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5088 p[iorder[0]] = (CH) & 0xff; \
5089 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005090 } while(0)
5091
5092 /* In narrow builds we can output surrogate pairs as one codepoint,
5093 so we need less space. */
5094#ifndef Py_UNICODE_WIDE
5095 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005096 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
5097 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
5098 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005099#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005100 nsize = (size - pairs + (byteorder == 0));
5101 bytesize = nsize * 4;
5102 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005103 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005104 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005105 if (v == NULL)
5106 return NULL;
5107
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005108 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005109 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005110 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005111 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005112 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005113
5114 if (byteorder == -1) {
5115 /* force LE */
5116 iorder[0] = 0;
5117 iorder[1] = 1;
5118 iorder[2] = 2;
5119 iorder[3] = 3;
5120 }
5121 else if (byteorder == 1) {
5122 /* force BE */
5123 iorder[0] = 3;
5124 iorder[1] = 2;
5125 iorder[2] = 1;
5126 iorder[3] = 0;
5127 }
5128
5129 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005130 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005131#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005132 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
5133 Py_UCS4 ch2 = *s;
5134 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
5135 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
5136 s++;
5137 size--;
5138 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005139 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005140#endif
5141 STORECHAR(ch);
5142 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005143
5144 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005145 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005146#undef STORECHAR
5147}
5148
Alexander Belopolsky40018472011-02-26 01:02:56 +00005149PyObject *
5150PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005151{
5152 if (!PyUnicode_Check(unicode)) {
5153 PyErr_BadArgument();
5154 return NULL;
5155 }
5156 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005157 PyUnicode_GET_SIZE(unicode),
5158 NULL,
5159 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005160}
5161
Guido van Rossumd57fd912000-03-10 22:53:23 +00005162/* --- UTF-16 Codec ------------------------------------------------------- */
5163
Tim Peters772747b2001-08-09 22:21:55 +00005164PyObject *
5165PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005166 Py_ssize_t size,
5167 const char *errors,
5168 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005169{
Walter Dörwald69652032004-09-07 20:24:22 +00005170 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5171}
5172
Antoine Pitrouab868312009-01-10 15:40:25 +00005173/* Two masks for fast checking of whether a C 'long' may contain
5174 UTF16-encoded surrogate characters. This is an efficient heuristic,
5175 assuming that non-surrogate characters with a code point >= 0x8000 are
5176 rare in most input.
5177 FAST_CHAR_MASK is used when the input is in native byte ordering,
5178 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00005179*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005180#if (SIZEOF_LONG == 8)
5181# define FAST_CHAR_MASK 0x8000800080008000L
5182# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
5183#elif (SIZEOF_LONG == 4)
5184# define FAST_CHAR_MASK 0x80008000L
5185# define SWAPPED_FAST_CHAR_MASK 0x00800080L
5186#else
5187# error C 'long' size should be either 4 or 8!
5188#endif
5189
Walter Dörwald69652032004-09-07 20:24:22 +00005190PyObject *
5191PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005192 Py_ssize_t size,
5193 const char *errors,
5194 int *byteorder,
5195 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005196{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005197 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005198 Py_ssize_t startinpos;
5199 Py_ssize_t endinpos;
5200 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005201 PyUnicodeObject *unicode;
5202 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00005203 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005204 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005205 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005206 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005207 /* Offsets from q for retrieving byte pairs in the right order. */
5208#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5209 int ihi = 1, ilo = 0;
5210#else
5211 int ihi = 0, ilo = 1;
5212#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005213 PyObject *errorHandler = NULL;
5214 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005215
5216 /* Note: size will always be longer than the resulting Unicode
5217 character count */
5218 unicode = _PyUnicode_New(size);
5219 if (!unicode)
5220 return NULL;
5221 if (size == 0)
5222 return (PyObject *)unicode;
5223
5224 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005225 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00005226 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005227 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005228
5229 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005230 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005231
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005232 /* Check for BOM marks (U+FEFF) in the input and adjust current
5233 byte order setting accordingly. In native mode, the leading BOM
5234 mark is skipped, in all other modes, it is copied to the output
5235 stream as-is (giving a ZWNBSP character). */
5236 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005237 if (size >= 2) {
5238 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005239#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005240 if (bom == 0xFEFF) {
5241 q += 2;
5242 bo = -1;
5243 }
5244 else if (bom == 0xFFFE) {
5245 q += 2;
5246 bo = 1;
5247 }
Tim Petersced69f82003-09-16 20:30:58 +00005248#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005249 if (bom == 0xFEFF) {
5250 q += 2;
5251 bo = 1;
5252 }
5253 else if (bom == 0xFFFE) {
5254 q += 2;
5255 bo = -1;
5256 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005257#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005258 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005259 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005260
Tim Peters772747b2001-08-09 22:21:55 +00005261 if (bo == -1) {
5262 /* force LE */
5263 ihi = 1;
5264 ilo = 0;
5265 }
5266 else if (bo == 1) {
5267 /* force BE */
5268 ihi = 0;
5269 ilo = 1;
5270 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005271#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5272 native_ordering = ilo < ihi;
5273#else
5274 native_ordering = ilo > ihi;
5275#endif
Tim Peters772747b2001-08-09 22:21:55 +00005276
Antoine Pitrouab868312009-01-10 15:40:25 +00005277 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005278 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005279 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005280 /* First check for possible aligned read of a C 'long'. Unaligned
5281 reads are more expensive, better to defer to another iteration. */
5282 if (!((size_t) q & LONG_PTR_MASK)) {
5283 /* Fast path for runs of non-surrogate chars. */
5284 register const unsigned char *_q = q;
5285 Py_UNICODE *_p = p;
5286 if (native_ordering) {
5287 /* Native ordering is simple: as long as the input cannot
5288 possibly contain a surrogate char, do an unrolled copy
5289 of several 16-bit code points to the target object.
5290 The non-surrogate check is done on several input bytes
5291 at a time (as many as a C 'long' can contain). */
5292 while (_q < aligned_end) {
5293 unsigned long data = * (unsigned long *) _q;
5294 if (data & FAST_CHAR_MASK)
5295 break;
5296 _p[0] = ((unsigned short *) _q)[0];
5297 _p[1] = ((unsigned short *) _q)[1];
5298#if (SIZEOF_LONG == 8)
5299 _p[2] = ((unsigned short *) _q)[2];
5300 _p[3] = ((unsigned short *) _q)[3];
5301#endif
5302 _q += SIZEOF_LONG;
5303 _p += SIZEOF_LONG / 2;
5304 }
5305 }
5306 else {
5307 /* Byteswapped ordering is similar, but we must decompose
5308 the copy bytewise, and take care of zero'ing out the
5309 upper bytes if the target object is in 32-bit units
5310 (that is, in UCS-4 builds). */
5311 while (_q < aligned_end) {
5312 unsigned long data = * (unsigned long *) _q;
5313 if (data & SWAPPED_FAST_CHAR_MASK)
5314 break;
5315 /* Zero upper bytes in UCS-4 builds */
5316#if (Py_UNICODE_SIZE > 2)
5317 _p[0] = 0;
5318 _p[1] = 0;
5319#if (SIZEOF_LONG == 8)
5320 _p[2] = 0;
5321 _p[3] = 0;
5322#endif
5323#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005324 /* Issue #4916; UCS-4 builds on big endian machines must
5325 fill the two last bytes of each 4-byte unit. */
5326#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
5327# define OFF 2
5328#else
5329# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00005330#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005331 ((unsigned char *) _p)[OFF + 1] = _q[0];
5332 ((unsigned char *) _p)[OFF + 0] = _q[1];
5333 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
5334 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
5335#if (SIZEOF_LONG == 8)
5336 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
5337 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
5338 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
5339 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
5340#endif
5341#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00005342 _q += SIZEOF_LONG;
5343 _p += SIZEOF_LONG / 2;
5344 }
5345 }
5346 p = _p;
5347 q = _q;
5348 if (q >= e)
5349 break;
5350 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005351 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005352
Benjamin Peterson14339b62009-01-31 16:36:08 +00005353 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005354
5355 if (ch < 0xD800 || ch > 0xDFFF) {
5356 *p++ = ch;
5357 continue;
5358 }
5359
5360 /* UTF-16 code pair: */
5361 if (q > e) {
5362 errmsg = "unexpected end of data";
5363 startinpos = (((const char *)q) - 2) - starts;
5364 endinpos = ((const char *)e) + 1 - starts;
5365 goto utf16Error;
5366 }
5367 if (0xD800 <= ch && ch <= 0xDBFF) {
5368 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
5369 q += 2;
5370 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00005371#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005372 *p++ = ch;
5373 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005374#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005375 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005376#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005377 continue;
5378 }
5379 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005380 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005381 startinpos = (((const char *)q)-4)-starts;
5382 endinpos = startinpos+2;
5383 goto utf16Error;
5384 }
5385
Benjamin Peterson14339b62009-01-31 16:36:08 +00005386 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005387 errmsg = "illegal encoding";
5388 startinpos = (((const char *)q)-2)-starts;
5389 endinpos = startinpos+2;
5390 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005391
Benjamin Peterson29060642009-01-31 22:14:21 +00005392 utf16Error:
5393 outpos = p - PyUnicode_AS_UNICODE(unicode);
5394 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005395 errors,
5396 &errorHandler,
5397 "utf16", errmsg,
5398 &starts,
5399 (const char **)&e,
5400 &startinpos,
5401 &endinpos,
5402 &exc,
5403 (const char **)&q,
5404 &unicode,
5405 &outpos,
5406 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00005407 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005408 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005409 /* remaining byte at the end? (size should be even) */
5410 if (e == q) {
5411 if (!consumed) {
5412 errmsg = "truncated data";
5413 startinpos = ((const char *)q) - starts;
5414 endinpos = ((const char *)e) + 1 - starts;
5415 outpos = p - PyUnicode_AS_UNICODE(unicode);
5416 if (unicode_decode_call_errorhandler(
5417 errors,
5418 &errorHandler,
5419 "utf16", errmsg,
5420 &starts,
5421 (const char **)&e,
5422 &startinpos,
5423 &endinpos,
5424 &exc,
5425 (const char **)&q,
5426 &unicode,
5427 &outpos,
5428 &p))
5429 goto onError;
5430 /* The remaining input chars are ignored if the callback
5431 chooses to skip the input */
5432 }
5433 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005434
5435 if (byteorder)
5436 *byteorder = bo;
5437
Walter Dörwald69652032004-09-07 20:24:22 +00005438 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005439 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005440
Guido van Rossumd57fd912000-03-10 22:53:23 +00005441 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02005442 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005443 goto onError;
5444
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005445 Py_XDECREF(errorHandler);
5446 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005447#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005448 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005449 Py_DECREF(unicode);
5450 return NULL;
5451 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005452#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005453 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005454 return (PyObject *)unicode;
5455
Benjamin Peterson29060642009-01-31 22:14:21 +00005456 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005457 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005458 Py_XDECREF(errorHandler);
5459 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005460 return NULL;
5461}
5462
Antoine Pitrouab868312009-01-10 15:40:25 +00005463#undef FAST_CHAR_MASK
5464#undef SWAPPED_FAST_CHAR_MASK
5465
Tim Peters772747b2001-08-09 22:21:55 +00005466PyObject *
5467PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005468 Py_ssize_t size,
5469 const char *errors,
5470 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005471{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005472 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005473 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005474 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005475#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005476 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005477#else
5478 const int pairs = 0;
5479#endif
Tim Peters772747b2001-08-09 22:21:55 +00005480 /* Offsets from p for storing byte pairs in the right order. */
5481#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5482 int ihi = 1, ilo = 0;
5483#else
5484 int ihi = 0, ilo = 1;
5485#endif
5486
Benjamin Peterson29060642009-01-31 22:14:21 +00005487#define STORECHAR(CH) \
5488 do { \
5489 p[ihi] = ((CH) >> 8) & 0xff; \
5490 p[ilo] = (CH) & 0xff; \
5491 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005492 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005493
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005494#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005495 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005496 if (s[i] >= 0x10000)
5497 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005498#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005499 /* 2 * (size + pairs + (byteorder == 0)) */
5500 if (size > PY_SSIZE_T_MAX ||
5501 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005502 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005503 nsize = size + pairs + (byteorder == 0);
5504 bytesize = nsize * 2;
5505 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005506 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005507 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005508 if (v == NULL)
5509 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005510
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005511 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005512 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005513 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005514 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005515 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005516
5517 if (byteorder == -1) {
5518 /* force LE */
5519 ihi = 1;
5520 ilo = 0;
5521 }
5522 else if (byteorder == 1) {
5523 /* force BE */
5524 ihi = 0;
5525 ilo = 1;
5526 }
5527
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005528 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005529 Py_UNICODE ch = *s++;
5530 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005531#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005532 if (ch >= 0x10000) {
5533 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5534 ch = 0xD800 | ((ch-0x10000) >> 10);
5535 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005536#endif
Tim Peters772747b2001-08-09 22:21:55 +00005537 STORECHAR(ch);
5538 if (ch2)
5539 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005540 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005541
5542 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005543 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005544#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005545}
5546
Alexander Belopolsky40018472011-02-26 01:02:56 +00005547PyObject *
5548PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005549{
5550 if (!PyUnicode_Check(unicode)) {
5551 PyErr_BadArgument();
5552 return NULL;
5553 }
5554 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005555 PyUnicode_GET_SIZE(unicode),
5556 NULL,
5557 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005558}
5559
5560/* --- Unicode Escape Codec ----------------------------------------------- */
5561
/* Helper function for PyUnicode_DecodeUnicodeEscape.  Scans the `size`
   bytes at `s` and decides whether the decoded text is guaranteed to be
   pure ASCII.

   Returns the number of characters the decoded string will contain, or
   -1 when that cannot be guaranteed: `size` is negative, a byte is
   above 127, a backslash is the last byte, or a numeric or named escape
   (octal, \x, \u, \U, \N) is present.
 */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Validate size before forming s + size: pointer arithmetic with a
       negative offset would be undefined. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                /* simple escapes decode to exactly one character */
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* unknown escape: count the backslash + the other
                   character */
                length += 2;
                break;
            }
        }
    }
    return length;
}
5616
5617/* Similar to PyUnicode_WRITE but either write into wstr field
5618 or treat string as ASCII. */
5619#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \
5620 do { \
5621 if ((kind) != PyUnicode_WCHAR_KIND) \
5622 ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
5623 else \
5624 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
5625 } while (0)
5626
5627#define WRITE_WSTR(buf, index, value) \
5628 assert(kind == PyUnicode_WCHAR_KIND), \
5629 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value)
5630
5631
Fredrik Lundh06d12682001-01-24 07:59:11 +00005632static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005633
Alexander Belopolsky40018472011-02-26 01:02:56 +00005634PyObject *
5635PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005636 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005637 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005638{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005639 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005640 Py_ssize_t startinpos;
5641 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005642 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005643 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005644 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005645 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005646 char* message;
5647 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005648 PyObject *errorHandler = NULL;
5649 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005650 Py_ssize_t ascii_length;
5651 Py_ssize_t i;
5652 int kind;
5653 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005654
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005655 ascii_length = length_of_escaped_ascii_string(s, size);
5656
5657 /* After length_of_escaped_ascii_string() there are two alternatives,
5658 either the string is pure ASCII with named escapes like \n, etc.
5659 and we determined it's exact size (common case)
5660 or it contains \x, \u, ... escape sequences. then we create a
5661 legacy wchar string and resize it at the end of this function. */
5662 if (ascii_length >= 0) {
5663 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
5664 if (!v)
5665 goto onError;
5666 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5667 kind = PyUnicode_1BYTE_KIND;
5668 data = PyUnicode_DATA(v);
5669 }
5670 else {
5671 /* Escaped strings will always be longer than the resulting
5672 Unicode string, so we start with size here and then reduce the
5673 length after conversion to the true value.
5674 (but if the error callback returns a long replacement string
5675 we'll have to allocate more space) */
5676 v = _PyUnicode_New(size);
5677 if (!v)
5678 goto onError;
5679 kind = PyUnicode_WCHAR_KIND;
5680 data = PyUnicode_AS_UNICODE(v);
5681 }
5682
Guido van Rossumd57fd912000-03-10 22:53:23 +00005683 if (size == 0)
5684 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005685 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005686 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005687
Guido van Rossumd57fd912000-03-10 22:53:23 +00005688 while (s < end) {
5689 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005690 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005691 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005692
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005693 if (kind == PyUnicode_WCHAR_KIND) {
5694 assert(i < _PyUnicode_WSTR_LENGTH(v));
5695 }
5696 else {
5697 /* The only case in which i == ascii_length is a backslash
5698 followed by a newline. */
5699 assert(i <= ascii_length);
5700 }
5701
Guido van Rossumd57fd912000-03-10 22:53:23 +00005702 /* Non-escape characters are interpreted as Unicode ordinals */
5703 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005704 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005705 continue;
5706 }
5707
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005708 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005709 /* \ - Escapes */
5710 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005711 c = *s++;
5712 if (s > end)
5713 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005714
5715 if (kind == PyUnicode_WCHAR_KIND) {
5716 assert(i < _PyUnicode_WSTR_LENGTH(v));
5717 }
5718 else {
5719 /* The only case in which i == ascii_length is a backslash
5720 followed by a newline. */
5721 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5722 }
5723
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005724 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005725
Benjamin Peterson29060642009-01-31 22:14:21 +00005726 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005727 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005728 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5729 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5730 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5731 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5732 /* FF */
5733 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5734 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5735 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5736 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5737 /* VT */
5738 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5739 /* BEL, not classic C */
5740 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005741
Benjamin Peterson29060642009-01-31 22:14:21 +00005742 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005743 case '0': case '1': case '2': case '3':
5744 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005745 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005746 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005747 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005748 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005749 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005750 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005751 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005752 break;
5753
Benjamin Peterson29060642009-01-31 22:14:21 +00005754 /* hex escapes */
5755 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005756 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005757 digits = 2;
5758 message = "truncated \\xXX escape";
5759 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005760
Benjamin Peterson29060642009-01-31 22:14:21 +00005761 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005762 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005763 digits = 4;
5764 message = "truncated \\uXXXX escape";
5765 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005766
Benjamin Peterson29060642009-01-31 22:14:21 +00005767 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005768 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005769 digits = 8;
5770 message = "truncated \\UXXXXXXXX escape";
5771 hexescape:
5772 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005773 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005774 if (s+digits>end) {
5775 endinpos = size;
5776 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005777 errors, &errorHandler,
5778 "unicodeescape", "end of string in escape sequence",
5779 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005780 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005781 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005782 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005783 goto nextByte;
5784 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005785 for (j = 0; j < digits; ++j) {
5786 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005787 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005788 endinpos = (s+j+1)-starts;
5789 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005790 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005791 errors, &errorHandler,
5792 "unicodeescape", message,
5793 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005794 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005795 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005796 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005797 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005798 }
5799 chr = (chr<<4) & ~0xF;
5800 if (c >= '0' && c <= '9')
5801 chr += c - '0';
5802 else if (c >= 'a' && c <= 'f')
5803 chr += 10 + c - 'a';
5804 else
5805 chr += 10 + c - 'A';
5806 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005807 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005808 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005809 /* _decoding_error will have already written into the
5810 target buffer. */
5811 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005812 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005813 /* when we get here, chr is a 32-bit unicode character */
5814 if (chr <= 0xffff)
5815 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005816 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005817 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005818 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005819 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005820#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005821 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005822#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005823 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005824 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5825 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005826#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005827 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005828 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005829 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005830 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005831 errors, &errorHandler,
5832 "unicodeescape", "illegal Unicode character",
5833 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005834 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005835 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005836 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005837 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005838 break;
5839
Benjamin Peterson29060642009-01-31 22:14:21 +00005840 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005841 case 'N':
5842 message = "malformed \\N character escape";
5843 if (ucnhash_CAPI == NULL) {
5844 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005845 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5846 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005847 if (ucnhash_CAPI == NULL)
5848 goto ucnhashError;
5849 }
5850 if (*s == '{') {
5851 const char *start = s+1;
5852 /* look for the closing brace */
5853 while (*s != '}' && s < end)
5854 s++;
5855 if (s > start && s < end && *s == '}') {
5856 /* found a name. look it up in the unicode database */
5857 message = "unknown Unicode character name";
5858 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005859 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005860 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005861 goto store;
5862 }
5863 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005864 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005865 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005866 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005867 errors, &errorHandler,
5868 "unicodeescape", message,
5869 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005870 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005871 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005872 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005873 break;
5874
5875 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005876 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005877 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005878 message = "\\ at end of string";
5879 s--;
5880 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005881 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005882 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005883 errors, &errorHandler,
5884 "unicodeescape", message,
5885 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005886 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005887 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005888 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005889 }
5890 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005891 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5892 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005893 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005894 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005895 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005896 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005897 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005898 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005899 /* Ensure the length prediction worked in case of ASCII strings */
5900 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5901
Victor Stinnerfe226c02011-10-03 03:52:20 +02005902 if (kind == PyUnicode_WCHAR_KIND)
5903 {
5904 if (PyUnicode_Resize((PyObject**)&v, i) < 0)
5905 goto onError;
Victor Stinnerfe226c02011-10-03 03:52:20 +02005906 }
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005907 Py_XDECREF(errorHandler);
5908 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005909#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005910 if (_PyUnicode_READY_REPLACE(&v)) {
5911 Py_DECREF(v);
5912 return NULL;
5913 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005914#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005915 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005917
Benjamin Peterson29060642009-01-31 22:14:21 +00005918 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005919 PyErr_SetString(
5920 PyExc_UnicodeError,
5921 "\\N escapes not supported (can't load unicodedata module)"
5922 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005923 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005924 Py_XDECREF(errorHandler);
5925 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005926 return NULL;
5927
Benjamin Peterson29060642009-01-31 22:14:21 +00005928 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005929 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005930 Py_XDECREF(errorHandler);
5931 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005932 return NULL;
5933}
5934
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005935#undef WRITE_ASCII_OR_WSTR
5936#undef WRITE_WSTR
5937
Guido van Rossumd57fd912000-03-10 22:53:23 +00005938/* Return a Unicode-Escape string version of the Unicode object.
5939
5940 If quotes is true, the string is enclosed in u"" or u'' quotes as
5941 appropriate.
5942
5943*/
5944
Alexander Belopolsky40018472011-02-26 01:02:56 +00005945PyObject *
5946PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005947 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005948{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005949 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005950 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005951
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005952#ifdef Py_UNICODE_WIDE
5953 const Py_ssize_t expandsize = 10;
5954#else
5955 const Py_ssize_t expandsize = 6;
5956#endif
5957
Thomas Wouters89f507f2006-12-13 04:49:30 +00005958 /* XXX(nnorwitz): rather than over-allocating, it would be
5959 better to choose a different scheme. Perhaps scan the
5960 first N-chars of the string and allocate based on that size.
5961 */
5962 /* Initial allocation is based on the longest-possible unichr
5963 escape.
5964
5965 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5966 unichr, so in this case it's the longest unichr escape. In
5967 narrow (UTF-16) builds this is five chars per source unichr
5968 since there are two unichrs in the surrogate pair, so in narrow
5969 (UTF-16) builds it's not the longest unichr escape.
5970
5971 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5972 so in the narrow (UTF-16) build case it's the longest unichr
5973 escape.
5974 */
5975
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005976 if (size == 0)
5977 return PyBytes_FromStringAndSize(NULL, 0);
5978
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005979 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005980 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005981
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005982 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005983 2
5984 + expandsize*size
5985 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005986 if (repr == NULL)
5987 return NULL;
5988
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005989 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005990
Guido van Rossumd57fd912000-03-10 22:53:23 +00005991 while (size-- > 0) {
5992 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005993
Walter Dörwald79e913e2007-05-12 11:08:06 +00005994 /* Escape backslashes */
5995 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005996 *p++ = '\\';
5997 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005998 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005999 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006000
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00006001#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006002 /* Map 21-bit characters to '\U00xxxxxx' */
6003 else if (ch >= 0x10000) {
6004 *p++ = '\\';
6005 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006006 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
6007 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
6008 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6009 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6010 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6011 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6012 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6013 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00006014 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006015 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00006016#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006017 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
6018 else if (ch >= 0xD800 && ch < 0xDC00) {
6019 Py_UNICODE ch2;
6020 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00006021
Benjamin Peterson29060642009-01-31 22:14:21 +00006022 ch2 = *s++;
6023 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00006024 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006025 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
6026 *p++ = '\\';
6027 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006028 *p++ = Py_hexdigits[(ucs >> 28) & 0x0000000F];
6029 *p++ = Py_hexdigits[(ucs >> 24) & 0x0000000F];
6030 *p++ = Py_hexdigits[(ucs >> 20) & 0x0000000F];
6031 *p++ = Py_hexdigits[(ucs >> 16) & 0x0000000F];
6032 *p++ = Py_hexdigits[(ucs >> 12) & 0x0000000F];
6033 *p++ = Py_hexdigits[(ucs >> 8) & 0x0000000F];
6034 *p++ = Py_hexdigits[(ucs >> 4) & 0x0000000F];
6035 *p++ = Py_hexdigits[ucs & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00006036 continue;
6037 }
6038 /* Fall through: isolated surrogates are copied as-is */
6039 s--;
6040 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006041 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00006042#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006043
Guido van Rossumd57fd912000-03-10 22:53:23 +00006044 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006045 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006046 *p++ = '\\';
6047 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006048 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6049 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6050 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6051 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006052 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006053
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006054 /* Map special whitespace to '\t', \n', '\r' */
6055 else if (ch == '\t') {
6056 *p++ = '\\';
6057 *p++ = 't';
6058 }
6059 else if (ch == '\n') {
6060 *p++ = '\\';
6061 *p++ = 'n';
6062 }
6063 else if (ch == '\r') {
6064 *p++ = '\\';
6065 *p++ = 'r';
6066 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006067
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006068 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006069 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006070 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006071 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006072 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6073 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006074 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006075
Guido van Rossumd57fd912000-03-10 22:53:23 +00006076 /* Copy everything else as-is */
6077 else
6078 *p++ = (char) ch;
6079 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006080
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006081 assert(p - PyBytes_AS_STRING(repr) > 0);
6082 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6083 return NULL;
6084 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006085}
6086
Alexander Belopolsky40018472011-02-26 01:02:56 +00006087PyObject *
6088PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006089{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006090 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006091 if (!PyUnicode_Check(unicode)) {
6092 PyErr_BadArgument();
6093 return NULL;
6094 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00006095 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
6096 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006097 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006098}
6099
6100/* --- Raw Unicode Escape Codec ------------------------------------------- */
6101
Alexander Belopolsky40018472011-02-26 01:02:56 +00006102PyObject *
6103PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006104 Py_ssize_t size,
6105 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006106{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006107 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006108 Py_ssize_t startinpos;
6109 Py_ssize_t endinpos;
6110 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006111 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006112 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006113 const char *end;
6114 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006115 PyObject *errorHandler = NULL;
6116 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006117
Guido van Rossumd57fd912000-03-10 22:53:23 +00006118 /* Escaped strings will always be longer than the resulting
6119 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006120 length after conversion to the true value. (But decoding error
6121 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006122 v = _PyUnicode_New(size);
6123 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006124 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006125 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006126 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006127 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006128 end = s + size;
6129 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006130 unsigned char c;
6131 Py_UCS4 x;
6132 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006133 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006134
Benjamin Peterson29060642009-01-31 22:14:21 +00006135 /* Non-escape characters are interpreted as Unicode ordinals */
6136 if (*s != '\\') {
6137 *p++ = (unsigned char)*s++;
6138 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006139 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006140 startinpos = s-starts;
6141
6142 /* \u-escapes are only interpreted iff the number of leading
6143 backslashes if odd */
6144 bs = s;
6145 for (;s < end;) {
6146 if (*s != '\\')
6147 break;
6148 *p++ = (unsigned char)*s++;
6149 }
6150 if (((s - bs) & 1) == 0 ||
6151 s >= end ||
6152 (*s != 'u' && *s != 'U')) {
6153 continue;
6154 }
6155 p--;
6156 count = *s=='u' ? 4 : 8;
6157 s++;
6158
6159 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
6160 outpos = p-PyUnicode_AS_UNICODE(v);
6161 for (x = 0, i = 0; i < count; ++i, ++s) {
6162 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006163 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006164 endinpos = s-starts;
6165 if (unicode_decode_call_errorhandler(
6166 errors, &errorHandler,
6167 "rawunicodeescape", "truncated \\uXXXX",
6168 &starts, &end, &startinpos, &endinpos, &exc, &s,
6169 &v, &outpos, &p))
6170 goto onError;
6171 goto nextByte;
6172 }
6173 x = (x<<4) & ~0xF;
6174 if (c >= '0' && c <= '9')
6175 x += c - '0';
6176 else if (c >= 'a' && c <= 'f')
6177 x += 10 + c - 'a';
6178 else
6179 x += 10 + c - 'A';
6180 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00006181 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00006182 /* UCS-2 character */
6183 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006184 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006185 /* UCS-4 character. Either store directly, or as
6186 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00006187#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006188 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006189#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006190 x -= 0x10000L;
6191 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
6192 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00006193#endif
6194 } else {
6195 endinpos = s-starts;
6196 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006197 if (unicode_decode_call_errorhandler(
6198 errors, &errorHandler,
6199 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006200 &starts, &end, &startinpos, &endinpos, &exc, &s,
6201 &v, &outpos, &p))
6202 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006203 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006204 nextByte:
6205 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006206 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02006207 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006208 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006209 Py_XDECREF(errorHandler);
6210 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006211#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006212 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006213 Py_DECREF(v);
6214 return NULL;
6215 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006216#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006217 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006218 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006219
Benjamin Peterson29060642009-01-31 22:14:21 +00006220 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006221 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006222 Py_XDECREF(errorHandler);
6223 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006224 return NULL;
6225}
6226
Alexander Belopolsky40018472011-02-26 01:02:56 +00006227PyObject *
6228PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006229 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006230{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006231 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006232 char *p;
6233 char *q;
6234
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006235#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006236 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006237#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006238 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006239#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00006240
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006241 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006242 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006243
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006244 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006245 if (repr == NULL)
6246 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00006247 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006248 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006249
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006250 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006251 while (size-- > 0) {
6252 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006253#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006254 /* Map 32-bit characters to '\Uxxxxxxxx' */
6255 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006256 *p++ = '\\';
6257 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006258 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6259 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6260 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6261 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6262 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6263 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6264 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6265 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006266 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006267 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00006268#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006269 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
6270 if (ch >= 0xD800 && ch < 0xDC00) {
6271 Py_UNICODE ch2;
6272 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006273
Benjamin Peterson29060642009-01-31 22:14:21 +00006274 ch2 = *s++;
6275 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00006276 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006277 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
6278 *p++ = '\\';
6279 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006280 *p++ = Py_hexdigits[(ucs >> 28) & 0xf];
6281 *p++ = Py_hexdigits[(ucs >> 24) & 0xf];
6282 *p++ = Py_hexdigits[(ucs >> 20) & 0xf];
6283 *p++ = Py_hexdigits[(ucs >> 16) & 0xf];
6284 *p++ = Py_hexdigits[(ucs >> 12) & 0xf];
6285 *p++ = Py_hexdigits[(ucs >> 8) & 0xf];
6286 *p++ = Py_hexdigits[(ucs >> 4) & 0xf];
6287 *p++ = Py_hexdigits[ucs & 0xf];
Benjamin Peterson29060642009-01-31 22:14:21 +00006288 continue;
6289 }
6290 /* Fall through: isolated surrogates are copied as-is */
6291 s--;
6292 size++;
6293 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006294#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006295 /* Map 16-bit characters to '\uxxxx' */
6296 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006297 *p++ = '\\';
6298 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006299 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6300 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6301 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6302 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006303 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006304 /* Copy everything else as-is */
6305 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006306 *p++ = (char) ch;
6307 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006308 size = p - q;
6309
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006310 assert(size > 0);
6311 if (_PyBytes_Resize(&repr, size) < 0)
6312 return NULL;
6313 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006314}
6315
Alexander Belopolsky40018472011-02-26 01:02:56 +00006316PyObject *
6317PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006318{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006319 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006320 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00006321 PyErr_BadArgument();
6322 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006323 }
Walter Dörwald711005d2007-05-12 12:03:26 +00006324 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
6325 PyUnicode_GET_SIZE(unicode));
6326
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006327 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006328}
6329
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006330/* --- Unicode Internal Codec ------------------------------------------- */
6331
Alexander Belopolsky40018472011-02-26 01:02:56 +00006332PyObject *
6333_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006334 Py_ssize_t size,
6335 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006336{
6337 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006338 Py_ssize_t startinpos;
6339 Py_ssize_t endinpos;
6340 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006341 PyUnicodeObject *v;
6342 Py_UNICODE *p;
6343 const char *end;
6344 const char *reason;
6345 PyObject *errorHandler = NULL;
6346 PyObject *exc = NULL;
6347
Neal Norwitzd43069c2006-01-08 01:12:10 +00006348#ifdef Py_UNICODE_WIDE
6349 Py_UNICODE unimax = PyUnicode_GetMax();
6350#endif
6351
Thomas Wouters89f507f2006-12-13 04:49:30 +00006352 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006353 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
6354 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006355 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006356 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
6357 as string was created with the old API. */
6358 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006359 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006360 p = PyUnicode_AS_UNICODE(v);
6361 end = s + size;
6362
6363 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006364 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006365 /* We have to sanity check the raw data, otherwise doom looms for
6366 some malformed UCS-4 data. */
6367 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006368#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006369 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006370#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006371 end-s < Py_UNICODE_SIZE
6372 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006373 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006374 startinpos = s - starts;
6375 if (end-s < Py_UNICODE_SIZE) {
6376 endinpos = end-starts;
6377 reason = "truncated input";
6378 }
6379 else {
6380 endinpos = s - starts + Py_UNICODE_SIZE;
6381 reason = "illegal code point (> 0x10FFFF)";
6382 }
6383 outpos = p - PyUnicode_AS_UNICODE(v);
6384 if (unicode_decode_call_errorhandler(
6385 errors, &errorHandler,
6386 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006387 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00006388 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006389 goto onError;
6390 }
6391 }
6392 else {
6393 p++;
6394 s += Py_UNICODE_SIZE;
6395 }
6396 }
6397
Victor Stinnerfe226c02011-10-03 03:52:20 +02006398 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006399 goto onError;
6400 Py_XDECREF(errorHandler);
6401 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006402#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006403 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006404 Py_DECREF(v);
6405 return NULL;
6406 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006407#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006408 assert(_PyUnicode_CheckConsistency(v, 1));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006409 return (PyObject *)v;
6410
Benjamin Peterson29060642009-01-31 22:14:21 +00006411 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006412 Py_XDECREF(v);
6413 Py_XDECREF(errorHandler);
6414 Py_XDECREF(exc);
6415 return NULL;
6416}
6417
Guido van Rossumd57fd912000-03-10 22:53:23 +00006418/* --- Latin-1 Codec ------------------------------------------------------ */
6419
Alexander Belopolsky40018472011-02-26 01:02:56 +00006420PyObject *
6421PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006422 Py_ssize_t size,
6423 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006424{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006425 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006426 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006427}
6428
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006429/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006430static void
6431make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006432 const char *encoding,
6433 const Py_UNICODE *unicode, Py_ssize_t size,
6434 Py_ssize_t startpos, Py_ssize_t endpos,
6435 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006436{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006437 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006438 *exceptionObject = PyUnicodeEncodeError_Create(
6439 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006440 }
6441 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006442 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6443 goto onError;
6444 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6445 goto onError;
6446 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6447 goto onError;
6448 return;
6449 onError:
6450 Py_DECREF(*exceptionObject);
6451 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006452 }
6453}
6454
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006455/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006456static void
6457raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006458 const char *encoding,
6459 const Py_UNICODE *unicode, Py_ssize_t size,
6460 Py_ssize_t startpos, Py_ssize_t endpos,
6461 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006462{
6463 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006464 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006465 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006466 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006467}
6468
6469/* error handling callback helper:
6470 build arguments, call the callback and check the arguments,
6471 put the result into newpos and return the replacement string, which
6472 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006473static PyObject *
6474unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006475 PyObject **errorHandler,
6476 const char *encoding, const char *reason,
6477 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
6478 Py_ssize_t startpos, Py_ssize_t endpos,
6479 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006480{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006481 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006482
6483 PyObject *restuple;
6484 PyObject *resunicode;
6485
6486 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006487 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006488 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006489 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006490 }
6491
6492 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006493 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006494 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006495 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006496
6497 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006498 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006499 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006500 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006501 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006502 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006503 Py_DECREF(restuple);
6504 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006505 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006506 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006507 &resunicode, newpos)) {
6508 Py_DECREF(restuple);
6509 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006510 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006511 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6512 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6513 Py_DECREF(restuple);
6514 return NULL;
6515 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006516 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006517 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006518 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006519 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6520 Py_DECREF(restuple);
6521 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006522 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006523 Py_INCREF(resunicode);
6524 Py_DECREF(restuple);
6525 return resunicode;
6526}
6527
Alexander Belopolsky40018472011-02-26 01:02:56 +00006528static PyObject *
6529unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006530 Py_ssize_t size,
6531 const char *errors,
6532 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006533{
6534 /* output object */
6535 PyObject *res;
6536 /* pointers to the beginning and end+1 of input */
6537 const Py_UNICODE *startp = p;
6538 const Py_UNICODE *endp = p + size;
6539 /* pointer to the beginning of the unencodable characters */
6540 /* const Py_UNICODE *badp = NULL; */
6541 /* pointer into the output */
6542 char *str;
6543 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006544 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006545 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6546 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006547 PyObject *errorHandler = NULL;
6548 PyObject *exc = NULL;
6549 /* the following variable is used for caching string comparisons
6550 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6551 int known_errorHandler = -1;
6552
6553 /* allocate enough for a simple encoding without
6554 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006555 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006556 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006557 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006558 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006559 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006560 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006561 ressize = size;
6562
6563 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006564 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006565
Benjamin Peterson29060642009-01-31 22:14:21 +00006566 /* can we encode this? */
6567 if (c<limit) {
6568 /* no overflow check, because we know that the space is enough */
6569 *str++ = (char)c;
6570 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006571 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006572 else {
6573 Py_ssize_t unicodepos = p-startp;
6574 Py_ssize_t requiredsize;
6575 PyObject *repunicode;
6576 Py_ssize_t repsize;
6577 Py_ssize_t newpos;
6578 Py_ssize_t respos;
6579 Py_UNICODE *uni2;
6580 /* startpos for collecting unencodable chars */
6581 const Py_UNICODE *collstart = p;
6582 const Py_UNICODE *collend = p;
6583 /* find all unecodable characters */
6584 while ((collend < endp) && ((*collend)>=limit))
6585 ++collend;
6586 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6587 if (known_errorHandler==-1) {
6588 if ((errors==NULL) || (!strcmp(errors, "strict")))
6589 known_errorHandler = 1;
6590 else if (!strcmp(errors, "replace"))
6591 known_errorHandler = 2;
6592 else if (!strcmp(errors, "ignore"))
6593 known_errorHandler = 3;
6594 else if (!strcmp(errors, "xmlcharrefreplace"))
6595 known_errorHandler = 4;
6596 else
6597 known_errorHandler = 0;
6598 }
6599 switch (known_errorHandler) {
6600 case 1: /* strict */
6601 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
6602 goto onError;
6603 case 2: /* replace */
6604 while (collstart++<collend)
6605 *str++ = '?'; /* fall through */
6606 case 3: /* ignore */
6607 p = collend;
6608 break;
6609 case 4: /* xmlcharrefreplace */
6610 respos = str - PyBytes_AS_STRING(res);
6611 /* determine replacement size (temporarily (mis)uses p) */
6612 for (p = collstart, repsize = 0; p < collend; ++p) {
6613 if (*p<10)
6614 repsize += 2+1+1;
6615 else if (*p<100)
6616 repsize += 2+2+1;
6617 else if (*p<1000)
6618 repsize += 2+3+1;
6619 else if (*p<10000)
6620 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006621#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006622 else
6623 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006624#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006625 else if (*p<100000)
6626 repsize += 2+5+1;
6627 else if (*p<1000000)
6628 repsize += 2+6+1;
6629 else
6630 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00006631#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006632 }
6633 requiredsize = respos+repsize+(endp-collend);
6634 if (requiredsize > ressize) {
6635 if (requiredsize<2*ressize)
6636 requiredsize = 2*ressize;
6637 if (_PyBytes_Resize(&res, requiredsize))
6638 goto onError;
6639 str = PyBytes_AS_STRING(res) + respos;
6640 ressize = requiredsize;
6641 }
6642 /* generate replacement (temporarily (mis)uses p) */
6643 for (p = collstart; p < collend; ++p) {
6644 str += sprintf(str, "&#%d;", (int)*p);
6645 }
6646 p = collend;
6647 break;
6648 default:
6649 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6650 encoding, reason, startp, size, &exc,
6651 collstart-startp, collend-startp, &newpos);
6652 if (repunicode == NULL)
6653 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006654 if (PyBytes_Check(repunicode)) {
6655 /* Directly copy bytes result to output. */
6656 repsize = PyBytes_Size(repunicode);
6657 if (repsize > 1) {
6658 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006659 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006660 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6661 Py_DECREF(repunicode);
6662 goto onError;
6663 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006664 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006665 ressize += repsize-1;
6666 }
6667 memcpy(str, PyBytes_AsString(repunicode), repsize);
6668 str += repsize;
6669 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006670 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006671 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006672 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006673 /* need more space? (at least enough for what we
6674 have+the replacement+the rest of the string, so
6675 we won't have to check space for encodable characters) */
6676 respos = str - PyBytes_AS_STRING(res);
6677 repsize = PyUnicode_GET_SIZE(repunicode);
6678 requiredsize = respos+repsize+(endp-collend);
6679 if (requiredsize > ressize) {
6680 if (requiredsize<2*ressize)
6681 requiredsize = 2*ressize;
6682 if (_PyBytes_Resize(&res, requiredsize)) {
6683 Py_DECREF(repunicode);
6684 goto onError;
6685 }
6686 str = PyBytes_AS_STRING(res) + respos;
6687 ressize = requiredsize;
6688 }
6689 /* check if there is anything unencodable in the replacement
6690 and copy it to the output */
6691 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
6692 c = *uni2;
6693 if (c >= limit) {
6694 raise_encode_exception(&exc, encoding, startp, size,
6695 unicodepos, unicodepos+1, reason);
6696 Py_DECREF(repunicode);
6697 goto onError;
6698 }
6699 *str = (char)c;
6700 }
6701 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006702 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006703 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006704 }
6705 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006706 /* Resize if we allocated to much */
6707 size = str - PyBytes_AS_STRING(res);
6708 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006709 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006710 if (_PyBytes_Resize(&res, size) < 0)
6711 goto onError;
6712 }
6713
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006714 Py_XDECREF(errorHandler);
6715 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006716 return res;
6717
6718 onError:
6719 Py_XDECREF(res);
6720 Py_XDECREF(errorHandler);
6721 Py_XDECREF(exc);
6722 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006723}
6724
Alexander Belopolsky40018472011-02-26 01:02:56 +00006725PyObject *
6726PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006727 Py_ssize_t size,
6728 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006729{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006730 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006731}
6732
Alexander Belopolsky40018472011-02-26 01:02:56 +00006733PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006734_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006735{
6736 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006737 PyErr_BadArgument();
6738 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006739 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006740 if (PyUnicode_READY(unicode) == -1)
6741 return NULL;
6742 /* Fast path: if it is a one-byte string, construct
6743 bytes object directly. */
6744 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6745 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6746 PyUnicode_GET_LENGTH(unicode));
6747 /* Non-Latin-1 characters present. Defer to above function to
6748 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006749 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006750 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006751 errors);
6752}
6753
6754PyObject*
6755PyUnicode_AsLatin1String(PyObject *unicode)
6756{
6757 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006758}
6759
6760/* --- 7-bit ASCII Codec -------------------------------------------------- */
6761
Alexander Belopolsky40018472011-02-26 01:02:56 +00006762PyObject *
6763PyUnicode_DecodeASCII(const char *s,
6764 Py_ssize_t size,
6765 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006766{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006767 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006768 PyUnicodeObject *v;
Victor Stinner702c7342011-10-05 13:50:52 +02006769 Py_UNICODE *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006770 Py_ssize_t startinpos;
6771 Py_ssize_t endinpos;
6772 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006773 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006774 int has_error;
6775 const unsigned char *p = (const unsigned char *)s;
6776 const unsigned char *end = p + size;
6777 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006778 PyObject *errorHandler = NULL;
6779 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006780
Guido van Rossumd57fd912000-03-10 22:53:23 +00006781 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006782 if (size == 1 && (unsigned char)s[0] < 128)
6783 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006784
Victor Stinner702c7342011-10-05 13:50:52 +02006785 has_error = 0;
6786 while (p < end && !has_error) {
6787 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6788 an explanation. */
6789 if (!((size_t) p & LONG_PTR_MASK)) {
6790 /* Help register allocation */
6791 register const unsigned char *_p = p;
6792 while (_p < aligned_end) {
6793 unsigned long value = *(unsigned long *) _p;
6794 if (value & ASCII_CHAR_MASK) {
6795 has_error = 1;
6796 break;
6797 }
6798 _p += SIZEOF_LONG;
6799 }
6800 if (_p == end)
6801 break;
6802 if (has_error)
6803 break;
6804 p = _p;
6805 }
6806 if (*p & 0x80) {
6807 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006808 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006809 }
6810 else {
6811 ++p;
6812 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006813 }
Victor Stinner702c7342011-10-05 13:50:52 +02006814 if (!has_error)
6815 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006816
Guido van Rossumd57fd912000-03-10 22:53:23 +00006817 v = _PyUnicode_New(size);
6818 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006819 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006820 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006821 return (PyObject *)v;
Victor Stinner702c7342011-10-05 13:50:52 +02006822 u = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006823 e = s + size;
6824 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006825 register unsigned char c = (unsigned char)*s;
6826 if (c < 128) {
Victor Stinner702c7342011-10-05 13:50:52 +02006827 *u++ = c;
Benjamin Peterson29060642009-01-31 22:14:21 +00006828 ++s;
6829 }
6830 else {
6831 startinpos = s-starts;
6832 endinpos = startinpos + 1;
Victor Stinner702c7342011-10-05 13:50:52 +02006833 outpos = u - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006834 if (unicode_decode_call_errorhandler(
6835 errors, &errorHandler,
6836 "ascii", "ordinal not in range(128)",
6837 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinner702c7342011-10-05 13:50:52 +02006838 &v, &outpos, &u))
Benjamin Peterson29060642009-01-31 22:14:21 +00006839 goto onError;
6840 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006841 }
Victor Stinner702c7342011-10-05 13:50:52 +02006842 if (u - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
6843 if (PyUnicode_Resize((PyObject**)&v, u - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006844 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006845 Py_XDECREF(errorHandler);
6846 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006847#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006848 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006849 Py_DECREF(v);
6850 return NULL;
6851 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006852#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006853 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006854 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006855
Benjamin Peterson29060642009-01-31 22:14:21 +00006856 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006857 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006858 Py_XDECREF(errorHandler);
6859 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006860 return NULL;
6861}
6862
Alexander Belopolsky40018472011-02-26 01:02:56 +00006863PyObject *
6864PyUnicode_EncodeASCII(const Py_UNICODE *p,
6865 Py_ssize_t size,
6866 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006867{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006868 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006869}
6870
Alexander Belopolsky40018472011-02-26 01:02:56 +00006871PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006872_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006873{
6874 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006875 PyErr_BadArgument();
6876 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006877 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006878 if (PyUnicode_READY(unicode) == -1)
6879 return NULL;
6880 /* Fast path: if it is an ASCII-only string, construct bytes object
6881 directly. Else defer to above function to raise the exception. */
6882 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6883 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6884 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006885 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006886 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006887 errors);
6888}
6889
6890PyObject *
6891PyUnicode_AsASCIIString(PyObject *unicode)
6892{
6893 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006894}
6895
Victor Stinner99b95382011-07-04 14:23:54 +02006896#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006897
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006898/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006899
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006900#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006901#define NEED_RETRY
6902#endif
6903
Victor Stinner3a50e702011-10-18 21:21:00 +02006904#ifndef WC_ERR_INVALID_CHARS
6905# define WC_ERR_INVALID_CHARS 0x0080
6906#endif
6907
6908static char*
6909code_page_name(UINT code_page, PyObject **obj)
6910{
6911 *obj = NULL;
6912 if (code_page == CP_ACP)
6913 return "mbcs";
6914 if (code_page == CP_UTF7)
6915 return "CP_UTF7";
6916 if (code_page == CP_UTF8)
6917 return "CP_UTF8";
6918
6919 *obj = PyBytes_FromFormat("cp%u", code_page);
6920 if (*obj == NULL)
6921 return NULL;
6922 return PyBytes_AS_STRING(*obj);
6923}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006924
Alexander Belopolsky40018472011-02-26 01:02:56 +00006925static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006926is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006927{
6928 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006929 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006930
Victor Stinner3a50e702011-10-18 21:21:00 +02006931 if (!IsDBCSLeadByteEx(code_page, *curr))
6932 return 0;
6933
6934 prev = CharPrevExA(code_page, s, curr, 0);
6935 if (prev == curr)
6936 return 1;
6937 /* FIXME: This code is limited to "true" double-byte encodings,
6938 as it assumes an incomplete character consists of a single
6939 byte. */
6940 if (curr - prev == 2)
6941 return 1;
6942 if (!IsDBCSLeadByteEx(code_page, *prev))
6943 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006944 return 0;
6945}
6946
Victor Stinner3a50e702011-10-18 21:21:00 +02006947static DWORD
6948decode_code_page_flags(UINT code_page)
6949{
6950 if (code_page == CP_UTF7) {
6951 /* The CP_UTF7 decoder only supports flags=0 */
6952 return 0;
6953 }
6954 else
6955 return MB_ERR_INVALID_CHARS;
6956}
6957
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006958/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006959 * Decode a byte string from a Windows code page into unicode object in strict
6960 * mode.
6961 *
6962 * Returns consumed size if succeed, returns -2 on decode error, or raise a
6963 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006964 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006965static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006966decode_code_page_strict(UINT code_page,
6967 PyUnicodeObject **v,
6968 const char *in,
6969 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006970{
Victor Stinner3a50e702011-10-18 21:21:00 +02006971 const DWORD flags = decode_code_page_flags(code_page);
6972 Py_UNICODE *out;
6973 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006974
6975 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006976 assert(insize > 0);
6977 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6978 if (outsize <= 0)
6979 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006980
6981 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006982 /* Create unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006983 *v = _PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006984 if (*v == NULL)
6985 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006986 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006987 }
6988 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006989 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006990 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6991 if (PyUnicode_Resize((PyObject**)v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006992 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006993 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006994 }
6995
6996 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006997 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6998 if (outsize <= 0)
6999 goto error;
7000 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007001
Victor Stinner3a50e702011-10-18 21:21:00 +02007002error:
7003 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7004 return -2;
7005 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007006 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007007}
7008
Victor Stinner3a50e702011-10-18 21:21:00 +02007009/*
7010 * Decode a byte string from a code page into unicode object with an error
7011 * handler.
7012 *
7013 * Returns consumed size if succeed, or raise a WindowsError or
7014 * UnicodeDecodeError exception and returns -1 on error.
7015 */
7016static int
7017decode_code_page_errors(UINT code_page,
7018 PyUnicodeObject **v,
7019 const char *in,
7020 int size,
7021 const char *errors)
7022{
7023 const char *startin = in;
7024 const char *endin = in + size;
7025 const DWORD flags = decode_code_page_flags(code_page);
7026 /* Ideally, we should get reason from FormatMessage. This is the Windows
7027 2000 English version of the message. */
7028 const char *reason = "No mapping for the Unicode character exists "
7029 "in the target code page.";
7030 /* each step cannot decode more than 1 character, but a character can be
7031 represented as a surrogate pair */
7032 wchar_t buffer[2], *startout, *out;
7033 int insize, outsize;
7034 PyObject *errorHandler = NULL;
7035 PyObject *exc = NULL;
7036 PyObject *encoding_obj = NULL;
7037 char *encoding;
7038 DWORD err;
7039 int ret = -1;
7040
7041 assert(size > 0);
7042
7043 encoding = code_page_name(code_page, &encoding_obj);
7044 if (encoding == NULL)
7045 return -1;
7046
7047 if (errors == NULL || strcmp(errors, "strict") == 0) {
7048 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7049 UnicodeDecodeError. */
7050 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7051 if (exc != NULL) {
7052 PyCodec_StrictErrors(exc);
7053 Py_CLEAR(exc);
7054 }
7055 goto error;
7056 }
7057
7058 if (*v == NULL) {
7059 /* Create unicode object */
7060 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7061 PyErr_NoMemory();
7062 goto error;
7063 }
7064 *v = _PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
7065 if (*v == NULL)
7066 goto error;
7067 startout = PyUnicode_AS_UNICODE(*v);
7068 }
7069 else {
7070 /* Extend unicode object */
7071 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7072 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7073 PyErr_NoMemory();
7074 goto error;
7075 }
7076 if (PyUnicode_Resize((PyObject**)v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
7077 goto error;
7078 startout = PyUnicode_AS_UNICODE(*v) + n;
7079 }
7080
7081 /* Decode the byte string character per character */
7082 out = startout;
7083 while (in < endin)
7084 {
7085 /* Decode a character */
7086 insize = 1;
7087 do
7088 {
7089 outsize = MultiByteToWideChar(code_page, flags,
7090 in, insize,
7091 buffer, Py_ARRAY_LENGTH(buffer));
7092 if (outsize > 0)
7093 break;
7094 err = GetLastError();
7095 if (err != ERROR_NO_UNICODE_TRANSLATION
7096 && err != ERROR_INSUFFICIENT_BUFFER)
7097 {
7098 PyErr_SetFromWindowsErr(0);
7099 goto error;
7100 }
7101 insize++;
7102 }
7103 /* 4=maximum length of a UTF-8 sequence */
7104 while (insize <= 4 && (in + insize) <= endin);
7105
7106 if (outsize <= 0) {
7107 Py_ssize_t startinpos, endinpos, outpos;
7108
7109 startinpos = in - startin;
7110 endinpos = startinpos + 1;
7111 outpos = out - PyUnicode_AS_UNICODE(*v);
7112 if (unicode_decode_call_errorhandler(
7113 errors, &errorHandler,
7114 encoding, reason,
7115 &startin, &endin, &startinpos, &endinpos, &exc, &in,
7116 v, &outpos, &out))
7117 {
7118 goto error;
7119 }
7120 }
7121 else {
7122 in += insize;
7123 memcpy(out, buffer, outsize * sizeof(wchar_t));
7124 out += outsize;
7125 }
7126 }
7127
7128 /* write a NUL character at the end */
7129 *out = 0;
7130
7131 /* Extend unicode object */
7132 outsize = out - startout;
7133 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
7134 if (PyUnicode_Resize((PyObject**)v, outsize) < 0)
7135 goto error;
7136 ret = 0;
7137
7138error:
7139 Py_XDECREF(encoding_obj);
7140 Py_XDECREF(errorHandler);
7141 Py_XDECREF(exc);
7142 return ret;
7143}
7144
7145/*
7146 * Decode a byte string from a Windows code page into unicode object. If
7147 * 'final' is set, converts trailing lead-byte too.
7148 *
7149 * Returns consumed size if succeed, or raise a WindowsError or
7150 * UnicodeDecodeError exception and returns -1 on error.
7151 */
7152static int
7153decode_code_page(UINT code_page,
7154 PyUnicodeObject **v,
7155 const char *s, int size,
7156 int final, const char *errors)
7157{
7158 int done;
7159
7160 /* Skip trailing lead-byte unless 'final' is set */
7161 if (size == 0) {
7162 if (*v == NULL) {
7163 Py_INCREF(unicode_empty);
7164 *v = (PyUnicodeObject*)unicode_empty;
7165 if (*v == NULL)
7166 return -1;
7167 }
7168 return 0;
7169 }
7170
7171 if (!final && is_dbcs_lead_byte(code_page, s, size - 1))
7172 --size;
7173
7174 done = decode_code_page_strict(code_page, v, s, size);
7175 if (done == -2)
7176 done = decode_code_page_errors(code_page, v, s, size, errors);
7177 return done;
7178}
7179
7180static PyObject *
7181decode_code_page_stateful(int code_page,
7182 const char *s,
7183 Py_ssize_t size,
7184 const char *errors,
7185 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007186{
7187 PyUnicodeObject *v = NULL;
7188 int done;
7189
Victor Stinner3a50e702011-10-18 21:21:00 +02007190 if (code_page < 0) {
7191 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7192 return NULL;
7193 }
7194
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007195 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007196 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007197
7198#ifdef NEED_RETRY
7199 retry:
7200 if (size > INT_MAX)
Victor Stinner3a50e702011-10-18 21:21:00 +02007201 done = decode_code_page(code_page, &v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007202 else
7203#endif
Victor Stinner3a50e702011-10-18 21:21:00 +02007204 done = decode_code_page(code_page, &v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007205
7206 if (done < 0) {
7207 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00007208 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007209 }
7210
7211 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007212 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007213
7214#ifdef NEED_RETRY
7215 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007216 s += done;
7217 size -= done;
7218 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007219 }
7220#endif
Victor Stinner3a50e702011-10-18 21:21:00 +02007221
Victor Stinner17efeed2011-10-04 20:05:46 +02007222#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007223 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007224 Py_DECREF(v);
7225 return NULL;
7226 }
Victor Stinner17efeed2011-10-04 20:05:46 +02007227#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02007228 assert(_PyUnicode_CheckConsistency(v, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007229 return (PyObject *)v;
7230}
7231
Alexander Belopolsky40018472011-02-26 01:02:56 +00007232PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007233PyUnicode_DecodeCodePageStateful(int code_page,
7234 const char *s,
7235 Py_ssize_t size,
7236 const char *errors,
7237 Py_ssize_t *consumed)
7238{
7239 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7240}
7241
7242PyObject *
7243PyUnicode_DecodeMBCSStateful(const char *s,
7244 Py_ssize_t size,
7245 const char *errors,
7246 Py_ssize_t *consumed)
7247{
7248 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7249}
7250
7251PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007252PyUnicode_DecodeMBCS(const char *s,
7253 Py_ssize_t size,
7254 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007255{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007256 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7257}
7258
Victor Stinner3a50e702011-10-18 21:21:00 +02007259static DWORD
7260encode_code_page_flags(UINT code_page, const char *errors)
7261{
7262 if (code_page == CP_UTF8) {
7263 if (winver.dwMajorVersion >= 6)
7264 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7265 and later */
7266 return WC_ERR_INVALID_CHARS;
7267 else
7268 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7269 return 0;
7270 }
7271 else if (code_page == CP_UTF7) {
7272 /* CP_UTF7 only supports flags=0 */
7273 return 0;
7274 }
7275 else {
7276 if (errors != NULL && strcmp(errors, "replace") == 0)
7277 return 0;
7278 else
7279 return WC_NO_BEST_FIT_CHARS;
7280 }
7281}
7282
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007283/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007284 * Encode a Unicode string to a Windows code page into a byte string in strict
7285 * mode.
7286 *
7287 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7288 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007289 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007290static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007291encode_code_page_strict(UINT code_page, PyObject **outbytes,
7292 const Py_UNICODE *p, const int size,
7293 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007294{
Victor Stinner554f3f02010-06-16 23:33:54 +00007295 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007296 BOOL *pusedDefaultChar = &usedDefaultChar;
7297 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007298 PyObject *exc = NULL;
Victor Stinner3a50e702011-10-18 21:21:00 +02007299 const DWORD flags = encode_code_page_flags(code_page, NULL);
7300 char *out;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007301
Victor Stinner3a50e702011-10-18 21:21:00 +02007302 assert(size > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007303
Victor Stinner3a50e702011-10-18 21:21:00 +02007304 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007305 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007306 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007307 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007308
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007309 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007310 outsize = WideCharToMultiByte(code_page, flags,
7311 p, size,
7312 NULL, 0,
7313 NULL, pusedDefaultChar);
7314 if (outsize <= 0)
7315 goto error;
7316 /* If we used a default char, then we failed! */
7317 if (pusedDefaultChar && *pusedDefaultChar)
7318 return -2;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007319
Victor Stinner3a50e702011-10-18 21:21:00 +02007320 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007321 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007322 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7323 if (*outbytes == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007324 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007325 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007326 }
7327 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007328 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007329 const Py_ssize_t n = PyBytes_Size(*outbytes);
7330 if (outsize > PY_SSIZE_T_MAX - n) {
7331 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00007332 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007333 }
7334 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7335 return -1;
7336 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007337 }
7338
7339 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007340 outsize = WideCharToMultiByte(code_page, flags,
7341 p, size,
7342 out, outsize,
7343 NULL, pusedDefaultChar);
7344 if (outsize <= 0)
7345 goto error;
7346 if (pusedDefaultChar && *pusedDefaultChar)
7347 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007348 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007349
Victor Stinner3a50e702011-10-18 21:21:00 +02007350error:
7351 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7352 return -2;
7353 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007354 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007355}
7356
Victor Stinner3a50e702011-10-18 21:21:00 +02007357/*
7358 * Encode a Unicode string to a Windows code page into a byte string using a
7359 * error handler.
7360 *
7361 * Returns consumed characters if succeed, or raise a WindowsError and returns
7362 * -1 on other error.
7363 */
7364static int
7365encode_code_page_errors(UINT code_page, PyObject **outbytes,
7366 const Py_UNICODE *in, const int insize,
7367 const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007368{
Victor Stinner3a50e702011-10-18 21:21:00 +02007369 const DWORD flags = encode_code_page_flags(code_page, errors);
7370 const Py_UNICODE *startin = in;
7371 const Py_UNICODE *endin = in + insize;
7372 /* Ideally, we should get reason from FormatMessage. This is the Windows
7373 2000 English version of the message. */
7374 const char *reason = "invalid character";
7375 /* 4=maximum length of a UTF-8 sequence */
7376 char buffer[4];
7377 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7378 Py_ssize_t outsize;
7379 char *out;
7380 int charsize;
7381 PyObject *errorHandler = NULL;
7382 PyObject *exc = NULL;
7383 PyObject *encoding_obj = NULL;
7384 char *encoding;
7385 int err;
7386 Py_ssize_t startpos, newpos, newoutsize;
7387 PyObject *rep;
7388 int ret = -1;
7389
7390 assert(insize > 0);
7391
7392 encoding = code_page_name(code_page, &encoding_obj);
7393 if (encoding == NULL)
7394 return -1;
7395
7396 if (errors == NULL || strcmp(errors, "strict") == 0) {
7397 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7398 then we raise a UnicodeEncodeError. */
7399 make_encode_exception(&exc, encoding, in, insize, 0, 0, reason);
7400 if (exc != NULL) {
7401 PyCodec_StrictErrors(exc);
7402 Py_DECREF(exc);
7403 }
7404 Py_XDECREF(encoding_obj);
7405 return -1;
7406 }
7407
7408 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7409 pusedDefaultChar = &usedDefaultChar;
7410 else
7411 pusedDefaultChar = NULL;
7412
7413 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7414 PyErr_NoMemory();
7415 goto error;
7416 }
7417 outsize = insize * Py_ARRAY_LENGTH(buffer);
7418
7419 if (*outbytes == NULL) {
7420 /* Create string object */
7421 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7422 if (*outbytes == NULL)
7423 goto error;
7424 out = PyBytes_AS_STRING(*outbytes);
7425 }
7426 else {
7427 /* Extend string object */
7428 Py_ssize_t n = PyBytes_Size(*outbytes);
7429 if (n > PY_SSIZE_T_MAX - outsize) {
7430 PyErr_NoMemory();
7431 goto error;
7432 }
7433 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7434 goto error;
7435 out = PyBytes_AS_STRING(*outbytes) + n;
7436 }
7437
7438 /* Encode the string character per character */
7439 while (in < endin)
7440 {
7441 if ((in + 2) <= endin
7442 && 0xD800 <= in[0] && in[0] <= 0xDBFF
7443 && 0xDC00 <= in[1] && in[1] <= 0xDFFF)
7444 charsize = 2;
7445 else
7446 charsize = 1;
7447
7448 outsize = WideCharToMultiByte(code_page, flags,
7449 in, charsize,
7450 buffer, Py_ARRAY_LENGTH(buffer),
7451 NULL, pusedDefaultChar);
7452 if (outsize > 0) {
7453 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7454 {
7455 in += charsize;
7456 memcpy(out, buffer, outsize);
7457 out += outsize;
7458 continue;
7459 }
7460 }
7461 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7462 PyErr_SetFromWindowsErr(0);
7463 goto error;
7464 }
7465
7466 charsize = Py_MAX(charsize - 1, 1);
7467 startpos = in - startin;
7468 rep = unicode_encode_call_errorhandler(
7469 errors, &errorHandler, encoding, reason,
7470 startin, insize, &exc,
7471 startpos, startpos + charsize, &newpos);
7472 if (rep == NULL)
7473 goto error;
7474 in = startin + newpos;
7475
7476 if (PyBytes_Check(rep)) {
7477 outsize = PyBytes_GET_SIZE(rep);
7478 if (outsize != 1) {
7479 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7480 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7481 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7482 Py_DECREF(rep);
7483 goto error;
7484 }
7485 out = PyBytes_AS_STRING(*outbytes) + offset;
7486 }
7487 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7488 out += outsize;
7489 }
7490 else {
7491 Py_ssize_t i;
7492 enum PyUnicode_Kind kind;
7493 void *data;
7494
7495 if (PyUnicode_READY(rep) < 0) {
7496 Py_DECREF(rep);
7497 goto error;
7498 }
7499
7500 outsize = PyUnicode_GET_LENGTH(rep);
7501 if (outsize != 1) {
7502 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7503 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7504 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7505 Py_DECREF(rep);
7506 goto error;
7507 }
7508 out = PyBytes_AS_STRING(*outbytes) + offset;
7509 }
7510 kind = PyUnicode_KIND(rep);
7511 data = PyUnicode_DATA(rep);
7512 for (i=0; i < outsize; i++) {
7513 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7514 if (ch > 127) {
7515 raise_encode_exception(&exc,
7516 encoding,
7517 startin, insize,
7518 startpos, startpos + charsize,
7519 "unable to encode error handler result to ASCII");
7520 Py_DECREF(rep);
7521 goto error;
7522 }
7523 *out = (unsigned char)ch;
7524 out++;
7525 }
7526 }
7527 Py_DECREF(rep);
7528 }
7529 /* write a NUL byte */
7530 *out = 0;
7531 outsize = out - PyBytes_AS_STRING(*outbytes);
7532 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7533 if (_PyBytes_Resize(outbytes, outsize) < 0)
7534 goto error;
7535 ret = 0;
7536
7537error:
7538 Py_XDECREF(encoding_obj);
7539 Py_XDECREF(errorHandler);
7540 Py_XDECREF(exc);
7541 return ret;
7542}
7543
7544/*
7545 * Encode a Unicode string to a Windows code page into a byte string.
7546 *
7547 * Returns consumed characters if succeed, or raise a WindowsError and returns
7548 * -1 on other error.
7549 */
7550static int
7551encode_code_page_chunk(UINT code_page, PyObject **outbytes,
7552 const Py_UNICODE *p, int size,
7553 const char* errors)
7554{
7555 int done;
7556
7557 if (size == 0) {
7558 if (*outbytes == NULL) {
7559 *outbytes = PyBytes_FromStringAndSize(NULL, 0);
7560 if (*outbytes == NULL)
7561 return -1;
7562 }
7563 return 0;
7564 }
7565
7566 done = encode_code_page_strict(code_page, outbytes, p, size, errors);
7567 if (done == -2)
7568 done = encode_code_page_errors(code_page, outbytes, p, size, errors);
7569 return done;
7570}
7571
7572static PyObject *
7573encode_code_page(int code_page,
7574 const Py_UNICODE *p, Py_ssize_t size,
7575 const char *errors)
7576{
7577 PyObject *outbytes = NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007578 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00007579
Victor Stinner3a50e702011-10-18 21:21:00 +02007580 if (code_page < 0) {
7581 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7582 return NULL;
7583 }
7584
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007585#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00007586 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007587 if (size > INT_MAX)
Victor Stinner3a50e702011-10-18 21:21:00 +02007588 ret = encode_code_page_chunk(code_page, &outbytes, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007589 else
7590#endif
Victor Stinner3a50e702011-10-18 21:21:00 +02007591 ret = encode_code_page_chunk(code_page, &outbytes, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007592
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007593 if (ret < 0) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007594 Py_XDECREF(outbytes);
Benjamin Peterson29060642009-01-31 22:14:21 +00007595 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007596 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007597
7598#ifdef NEED_RETRY
7599 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007600 p += INT_MAX;
7601 size -= INT_MAX;
7602 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007603 }
7604#endif
7605
Victor Stinner3a50e702011-10-18 21:21:00 +02007606 return outbytes;
7607}
7608
7609PyObject *
7610PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7611 Py_ssize_t size,
7612 const char *errors)
7613{
7614 return encode_code_page(CP_ACP, p, size, errors);
7615}
7616
7617PyObject *
7618PyUnicode_EncodeCodePage(int code_page,
7619 PyObject *unicode,
7620 const char *errors)
7621{
7622 const Py_UNICODE *p;
7623 Py_ssize_t size;
7624 p = PyUnicode_AsUnicodeAndSize(unicode, &size);
7625 if (p == NULL)
7626 return NULL;
7627 return encode_code_page(code_page, p, size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007628}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007629
Alexander Belopolsky40018472011-02-26 01:02:56 +00007630PyObject *
7631PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007632{
7633 if (!PyUnicode_Check(unicode)) {
7634 PyErr_BadArgument();
7635 return NULL;
7636 }
7637 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007638 PyUnicode_GET_SIZE(unicode),
7639 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007640}
7641
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007642#undef NEED_RETRY
7643
Victor Stinner99b95382011-07-04 14:23:54 +02007644#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007645
Guido van Rossumd57fd912000-03-10 22:53:23 +00007646/* --- Character Mapping Codec -------------------------------------------- */
7647
Alexander Belopolsky40018472011-02-26 01:02:56 +00007648PyObject *
7649PyUnicode_DecodeCharmap(const char *s,
7650 Py_ssize_t size,
7651 PyObject *mapping,
7652 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007653{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007654 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007655 Py_ssize_t startinpos;
7656 Py_ssize_t endinpos;
7657 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007658 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007659 PyUnicodeObject *v;
7660 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007661 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007662 PyObject *errorHandler = NULL;
7663 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007664 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007665 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00007666
Guido van Rossumd57fd912000-03-10 22:53:23 +00007667 /* Default to Latin-1 */
7668 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007669 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007670
7671 v = _PyUnicode_New(size);
7672 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007673 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007674 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007675 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007676 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007677 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007678 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007679 mapstring = PyUnicode_AS_UNICODE(mapping);
7680 maplen = PyUnicode_GET_SIZE(mapping);
7681 while (s < e) {
7682 unsigned char ch = *s;
7683 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007684
Benjamin Peterson29060642009-01-31 22:14:21 +00007685 if (ch < maplen)
7686 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00007687
Benjamin Peterson29060642009-01-31 22:14:21 +00007688 if (x == 0xfffe) {
7689 /* undefined mapping */
7690 outpos = p-PyUnicode_AS_UNICODE(v);
7691 startinpos = s-starts;
7692 endinpos = startinpos+1;
7693 if (unicode_decode_call_errorhandler(
7694 errors, &errorHandler,
7695 "charmap", "character maps to <undefined>",
7696 &starts, &e, &startinpos, &endinpos, &exc, &s,
7697 &v, &outpos, &p)) {
7698 goto onError;
7699 }
7700 continue;
7701 }
7702 *p++ = x;
7703 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007704 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007705 }
7706 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007707 while (s < e) {
7708 unsigned char ch = *s;
7709 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007710
Benjamin Peterson29060642009-01-31 22:14:21 +00007711 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7712 w = PyLong_FromLong((long)ch);
7713 if (w == NULL)
7714 goto onError;
7715 x = PyObject_GetItem(mapping, w);
7716 Py_DECREF(w);
7717 if (x == NULL) {
7718 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7719 /* No mapping found means: mapping is undefined. */
7720 PyErr_Clear();
7721 x = Py_None;
7722 Py_INCREF(x);
7723 } else
7724 goto onError;
7725 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007726
Benjamin Peterson29060642009-01-31 22:14:21 +00007727 /* Apply mapping */
7728 if (PyLong_Check(x)) {
7729 long value = PyLong_AS_LONG(x);
7730 if (value < 0 || value > 65535) {
7731 PyErr_SetString(PyExc_TypeError,
7732 "character mapping must be in range(65536)");
7733 Py_DECREF(x);
7734 goto onError;
7735 }
7736 *p++ = (Py_UNICODE)value;
7737 }
7738 else if (x == Py_None) {
7739 /* undefined mapping */
7740 outpos = p-PyUnicode_AS_UNICODE(v);
7741 startinpos = s-starts;
7742 endinpos = startinpos+1;
7743 if (unicode_decode_call_errorhandler(
7744 errors, &errorHandler,
7745 "charmap", "character maps to <undefined>",
7746 &starts, &e, &startinpos, &endinpos, &exc, &s,
7747 &v, &outpos, &p)) {
7748 Py_DECREF(x);
7749 goto onError;
7750 }
7751 Py_DECREF(x);
7752 continue;
7753 }
7754 else if (PyUnicode_Check(x)) {
7755 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007756
Benjamin Peterson29060642009-01-31 22:14:21 +00007757 if (targetsize == 1)
7758 /* 1-1 mapping */
7759 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007760
Benjamin Peterson29060642009-01-31 22:14:21 +00007761 else if (targetsize > 1) {
7762 /* 1-n mapping */
7763 if (targetsize > extrachars) {
7764 /* resize first */
7765 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
7766 Py_ssize_t needed = (targetsize - extrachars) + \
7767 (targetsize << 2);
7768 extrachars += needed;
7769 /* XXX overflow detection missing */
Victor Stinnerfe226c02011-10-03 03:52:20 +02007770 if (PyUnicode_Resize((PyObject**)&v,
Benjamin Peterson29060642009-01-31 22:14:21 +00007771 PyUnicode_GET_SIZE(v) + needed) < 0) {
7772 Py_DECREF(x);
7773 goto onError;
7774 }
7775 p = PyUnicode_AS_UNICODE(v) + oldpos;
7776 }
7777 Py_UNICODE_COPY(p,
7778 PyUnicode_AS_UNICODE(x),
7779 targetsize);
7780 p += targetsize;
7781 extrachars -= targetsize;
7782 }
7783 /* 1-0 mapping: skip the character */
7784 }
7785 else {
7786 /* wrong return value */
7787 PyErr_SetString(PyExc_TypeError,
7788 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007789 Py_DECREF(x);
7790 goto onError;
7791 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007792 Py_DECREF(x);
7793 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007794 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007795 }
7796 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02007797 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007798 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007799 Py_XDECREF(errorHandler);
7800 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02007801#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007802 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007803 Py_DECREF(v);
7804 return NULL;
7805 }
Victor Stinner17efeed2011-10-04 20:05:46 +02007806#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02007807 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007808 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00007809
Benjamin Peterson29060642009-01-31 22:14:21 +00007810 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007811 Py_XDECREF(errorHandler);
7812 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007813 Py_XDECREF(v);
7814 return NULL;
7815}
7816
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007817/* Charmap encoding: the lookup table */
7818
Alexander Belopolsky40018472011-02-26 01:02:56 +00007819struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007820 PyObject_HEAD
7821 unsigned char level1[32];
7822 int count2, count3;
7823 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007824};
7825
7826static PyObject*
7827encoding_map_size(PyObject *obj, PyObject* args)
7828{
7829 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007830 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007831 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007832}
7833
7834static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007835 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007836 PyDoc_STR("Return the size (in bytes) of this object") },
7837 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007838};
7839
7840static void
7841encoding_map_dealloc(PyObject* o)
7842{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007843 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007844}
7845
7846static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007847 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007848 "EncodingMap", /*tp_name*/
7849 sizeof(struct encoding_map), /*tp_basicsize*/
7850 0, /*tp_itemsize*/
7851 /* methods */
7852 encoding_map_dealloc, /*tp_dealloc*/
7853 0, /*tp_print*/
7854 0, /*tp_getattr*/
7855 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007856 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007857 0, /*tp_repr*/
7858 0, /*tp_as_number*/
7859 0, /*tp_as_sequence*/
7860 0, /*tp_as_mapping*/
7861 0, /*tp_hash*/
7862 0, /*tp_call*/
7863 0, /*tp_str*/
7864 0, /*tp_getattro*/
7865 0, /*tp_setattro*/
7866 0, /*tp_as_buffer*/
7867 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7868 0, /*tp_doc*/
7869 0, /*tp_traverse*/
7870 0, /*tp_clear*/
7871 0, /*tp_richcompare*/
7872 0, /*tp_weaklistoffset*/
7873 0, /*tp_iter*/
7874 0, /*tp_iternext*/
7875 encoding_map_methods, /*tp_methods*/
7876 0, /*tp_members*/
7877 0, /*tp_getset*/
7878 0, /*tp_base*/
7879 0, /*tp_dict*/
7880 0, /*tp_descr_get*/
7881 0, /*tp_descr_set*/
7882 0, /*tp_dictoffset*/
7883 0, /*tp_init*/
7884 0, /*tp_alloc*/
7885 0, /*tp_new*/
7886 0, /*tp_free*/
7887 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007888};
7889
7890PyObject*
7891PyUnicode_BuildEncodingMap(PyObject* string)
7892{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007893 PyObject *result;
7894 struct encoding_map *mresult;
7895 int i;
7896 int need_dict = 0;
7897 unsigned char level1[32];
7898 unsigned char level2[512];
7899 unsigned char *mlevel1, *mlevel2, *mlevel3;
7900 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007901 int kind;
7902 void *data;
7903 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007904
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007905 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007906 PyErr_BadArgument();
7907 return NULL;
7908 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007909 kind = PyUnicode_KIND(string);
7910 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007911 memset(level1, 0xFF, sizeof level1);
7912 memset(level2, 0xFF, sizeof level2);
7913
7914 /* If there isn't a one-to-one mapping of NULL to \0,
7915 or if there are non-BMP characters, we need to use
7916 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007917 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007918 need_dict = 1;
7919 for (i = 1; i < 256; i++) {
7920 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007921 ch = PyUnicode_READ(kind, data, i);
7922 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007923 need_dict = 1;
7924 break;
7925 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007926 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007927 /* unmapped character */
7928 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007929 l1 = ch >> 11;
7930 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007931 if (level1[l1] == 0xFF)
7932 level1[l1] = count2++;
7933 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007934 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007935 }
7936
7937 if (count2 >= 0xFF || count3 >= 0xFF)
7938 need_dict = 1;
7939
7940 if (need_dict) {
7941 PyObject *result = PyDict_New();
7942 PyObject *key, *value;
7943 if (!result)
7944 return NULL;
7945 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007946 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007947 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007948 if (!key || !value)
7949 goto failed1;
7950 if (PyDict_SetItem(result, key, value) == -1)
7951 goto failed1;
7952 Py_DECREF(key);
7953 Py_DECREF(value);
7954 }
7955 return result;
7956 failed1:
7957 Py_XDECREF(key);
7958 Py_XDECREF(value);
7959 Py_DECREF(result);
7960 return NULL;
7961 }
7962
7963 /* Create a three-level trie */
7964 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7965 16*count2 + 128*count3 - 1);
7966 if (!result)
7967 return PyErr_NoMemory();
7968 PyObject_Init(result, &EncodingMapType);
7969 mresult = (struct encoding_map*)result;
7970 mresult->count2 = count2;
7971 mresult->count3 = count3;
7972 mlevel1 = mresult->level1;
7973 mlevel2 = mresult->level23;
7974 mlevel3 = mresult->level23 + 16*count2;
7975 memcpy(mlevel1, level1, 32);
7976 memset(mlevel2, 0xFF, 16*count2);
7977 memset(mlevel3, 0, 128*count3);
7978 count3 = 0;
7979 for (i = 1; i < 256; i++) {
7980 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007981 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007982 /* unmapped character */
7983 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007984 o1 = PyUnicode_READ(kind, data, i)>>11;
7985 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007986 i2 = 16*mlevel1[o1] + o2;
7987 if (mlevel2[i2] == 0xFF)
7988 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007989 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007990 i3 = 128*mlevel2[i2] + o3;
7991 mlevel3[i3] = i;
7992 }
7993 return result;
7994}
7995
7996static int
7997encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
7998{
7999 struct encoding_map *map = (struct encoding_map*)mapping;
8000 int l1 = c>>11;
8001 int l2 = (c>>7) & 0xF;
8002 int l3 = c & 0x7F;
8003 int i;
8004
8005#ifdef Py_UNICODE_WIDE
8006 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008007 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008008 }
8009#endif
8010 if (c == 0)
8011 return 0;
8012 /* level 1*/
8013 i = map->level1[l1];
8014 if (i == 0xFF) {
8015 return -1;
8016 }
8017 /* level 2*/
8018 i = map->level23[16*i+l2];
8019 if (i == 0xFF) {
8020 return -1;
8021 }
8022 /* level 3 */
8023 i = map->level23[16*map->count2 + 128*i + l3];
8024 if (i == 0) {
8025 return -1;
8026 }
8027 return i;
8028}
8029
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008030/* Lookup the character ch in the mapping. If the character
8031 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008032 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008033static PyObject *
8034charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008035{
Christian Heimes217cfd12007-12-02 14:31:20 +00008036 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008037 PyObject *x;
8038
8039 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008040 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008041 x = PyObject_GetItem(mapping, w);
8042 Py_DECREF(w);
8043 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008044 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8045 /* No mapping found means: mapping is undefined. */
8046 PyErr_Clear();
8047 x = Py_None;
8048 Py_INCREF(x);
8049 return x;
8050 } else
8051 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008052 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008053 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008054 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008055 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008056 long value = PyLong_AS_LONG(x);
8057 if (value < 0 || value > 255) {
8058 PyErr_SetString(PyExc_TypeError,
8059 "character mapping must be in range(256)");
8060 Py_DECREF(x);
8061 return NULL;
8062 }
8063 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008064 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008065 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008066 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008067 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008068 /* wrong return value */
8069 PyErr_Format(PyExc_TypeError,
8070 "character mapping must return integer, bytes or None, not %.400s",
8071 x->ob_type->tp_name);
8072 Py_DECREF(x);
8073 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008074 }
8075}
8076
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008077static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008078charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008079{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008080 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8081 /* exponentially overallocate to minimize reallocations */
8082 if (requiredsize < 2*outsize)
8083 requiredsize = 2*outsize;
8084 if (_PyBytes_Resize(outobj, requiredsize))
8085 return -1;
8086 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008087}
8088
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* the lookup or a resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008092/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008093 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008094 space is available. Return a new reference to the object that
8095 was put in the output buffer, or Py_None, if the mapping was undefined
8096 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008097 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008098static charmapencode_result
8099charmapencode_output(Py_UNICODE c, PyObject *mapping,
8100 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008101{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008102 PyObject *rep;
8103 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008104 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008105
Christian Heimes90aa7642007-12-19 02:45:37 +00008106 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008107 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008108 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008109 if (res == -1)
8110 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008111 if (outsize<requiredsize)
8112 if (charmapencode_resize(outobj, outpos, requiredsize))
8113 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008114 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008115 outstart[(*outpos)++] = (char)res;
8116 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008117 }
8118
8119 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008120 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008121 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008122 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008123 Py_DECREF(rep);
8124 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008125 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008126 if (PyLong_Check(rep)) {
8127 Py_ssize_t requiredsize = *outpos+1;
8128 if (outsize<requiredsize)
8129 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8130 Py_DECREF(rep);
8131 return enc_EXCEPTION;
8132 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008133 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008134 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008135 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008136 else {
8137 const char *repchars = PyBytes_AS_STRING(rep);
8138 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8139 Py_ssize_t requiredsize = *outpos+repsize;
8140 if (outsize<requiredsize)
8141 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8142 Py_DECREF(rep);
8143 return enc_EXCEPTION;
8144 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008145 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008146 memcpy(outstart + *outpos, repchars, repsize);
8147 *outpos += repsize;
8148 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008149 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008150 Py_DECREF(rep);
8151 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008152}
8153
8154/* handle an error in PyUnicode_EncodeCharmap
8155 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008156static int
8157charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00008158 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008159 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008160 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008161 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008162{
8163 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008164 Py_ssize_t repsize;
8165 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008166 Py_UNICODE *uni2;
8167 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008168 Py_ssize_t collstartpos = *inpos;
8169 Py_ssize_t collendpos = *inpos+1;
8170 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008171 char *encoding = "charmap";
8172 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008173 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008174
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008175 /* find all unencodable characters */
8176 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008177 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008178 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008179 int res = encoding_map_lookup(p[collendpos], mapping);
8180 if (res != -1)
8181 break;
8182 ++collendpos;
8183 continue;
8184 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008185
Benjamin Peterson29060642009-01-31 22:14:21 +00008186 rep = charmapencode_lookup(p[collendpos], mapping);
8187 if (rep==NULL)
8188 return -1;
8189 else if (rep!=Py_None) {
8190 Py_DECREF(rep);
8191 break;
8192 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008193 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008194 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008195 }
8196 /* cache callback name lookup
8197 * (if not done yet, i.e. it's the first error) */
8198 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008199 if ((errors==NULL) || (!strcmp(errors, "strict")))
8200 *known_errorHandler = 1;
8201 else if (!strcmp(errors, "replace"))
8202 *known_errorHandler = 2;
8203 else if (!strcmp(errors, "ignore"))
8204 *known_errorHandler = 3;
8205 else if (!strcmp(errors, "xmlcharrefreplace"))
8206 *known_errorHandler = 4;
8207 else
8208 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008209 }
8210 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008211 case 1: /* strict */
8212 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
8213 return -1;
8214 case 2: /* replace */
8215 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008216 x = charmapencode_output('?', mapping, res, respos);
8217 if (x==enc_EXCEPTION) {
8218 return -1;
8219 }
8220 else if (x==enc_FAILED) {
8221 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
8222 return -1;
8223 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008224 }
8225 /* fall through */
8226 case 3: /* ignore */
8227 *inpos = collendpos;
8228 break;
8229 case 4: /* xmlcharrefreplace */
8230 /* generate replacement (temporarily (mis)uses p) */
8231 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008232 char buffer[2+29+1+1];
8233 char *cp;
8234 sprintf(buffer, "&#%d;", (int)p[collpos]);
8235 for (cp = buffer; *cp; ++cp) {
8236 x = charmapencode_output(*cp, mapping, res, respos);
8237 if (x==enc_EXCEPTION)
8238 return -1;
8239 else if (x==enc_FAILED) {
8240 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
8241 return -1;
8242 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008243 }
8244 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008245 *inpos = collendpos;
8246 break;
8247 default:
8248 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00008249 encoding, reason, p, size, exceptionObject,
8250 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008251 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008252 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008253 if (PyBytes_Check(repunicode)) {
8254 /* Directly copy bytes result to output. */
8255 Py_ssize_t outsize = PyBytes_Size(*res);
8256 Py_ssize_t requiredsize;
8257 repsize = PyBytes_Size(repunicode);
8258 requiredsize = *respos + repsize;
8259 if (requiredsize > outsize)
8260 /* Make room for all additional bytes. */
8261 if (charmapencode_resize(res, respos, requiredsize)) {
8262 Py_DECREF(repunicode);
8263 return -1;
8264 }
8265 memcpy(PyBytes_AsString(*res) + *respos,
8266 PyBytes_AsString(repunicode), repsize);
8267 *respos += repsize;
8268 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008269 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008270 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008271 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008272 /* generate replacement */
8273 repsize = PyUnicode_GET_SIZE(repunicode);
8274 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008275 x = charmapencode_output(*uni2, mapping, res, respos);
8276 if (x==enc_EXCEPTION) {
8277 return -1;
8278 }
8279 else if (x==enc_FAILED) {
8280 Py_DECREF(repunicode);
8281 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
8282 return -1;
8283 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008284 }
8285 *inpos = newpos;
8286 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008287 }
8288 return 0;
8289}
8290
Alexander Belopolsky40018472011-02-26 01:02:56 +00008291PyObject *
8292PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8293 Py_ssize_t size,
8294 PyObject *mapping,
8295 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008296{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008297 /* output object */
8298 PyObject *res = NULL;
8299 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008300 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008301 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008302 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008303 PyObject *errorHandler = NULL;
8304 PyObject *exc = NULL;
8305 /* the following variable is used for caching string comparisons
8306 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8307 * 3=ignore, 4=xmlcharrefreplace */
8308 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008309
8310 /* Default to Latin-1 */
8311 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008312 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008313
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008314 /* allocate enough for a simple encoding without
8315 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008316 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008317 if (res == NULL)
8318 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008319 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008320 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008321
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008322 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008323 /* try to encode it */
8324 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
8325 if (x==enc_EXCEPTION) /* error */
8326 goto onError;
8327 if (x==enc_FAILED) { /* unencodable character */
8328 if (charmap_encoding_error(p, size, &inpos, mapping,
8329 &exc,
8330 &known_errorHandler, &errorHandler, errors,
8331 &res, &respos)) {
8332 goto onError;
8333 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008334 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008335 else
8336 /* done with this character => adjust input position */
8337 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008338 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008339
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008340 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008341 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008342 if (_PyBytes_Resize(&res, respos) < 0)
8343 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008344
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008345 Py_XDECREF(exc);
8346 Py_XDECREF(errorHandler);
8347 return res;
8348
Benjamin Peterson29060642009-01-31 22:14:21 +00008349 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008350 Py_XDECREF(res);
8351 Py_XDECREF(exc);
8352 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008353 return NULL;
8354}
8355
Alexander Belopolsky40018472011-02-26 01:02:56 +00008356PyObject *
8357PyUnicode_AsCharmapString(PyObject *unicode,
8358 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008359{
8360 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008361 PyErr_BadArgument();
8362 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008363 }
8364 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00008365 PyUnicode_GET_SIZE(unicode),
8366 mapping,
8367 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008368}
8369
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008370/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008371static void
8372make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008373 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008374 Py_ssize_t startpos, Py_ssize_t endpos,
8375 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008376{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008377 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008378 *exceptionObject = _PyUnicodeTranslateError_Create(
8379 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008380 }
8381 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008382 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8383 goto onError;
8384 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8385 goto onError;
8386 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8387 goto onError;
8388 return;
8389 onError:
8390 Py_DECREF(*exceptionObject);
8391 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008392 }
8393}
8394
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008395/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008396static void
8397raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008398 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008399 Py_ssize_t startpos, Py_ssize_t endpos,
8400 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008401{
8402 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008403 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008404 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008405 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008406}
8407
8408/* error handling callback helper:
8409 build arguments, call the callback and check the arguments,
8410 put the result into newpos and return the replacement string, which
8411 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008412static PyObject *
8413unicode_translate_call_errorhandler(const char *errors,
8414 PyObject **errorHandler,
8415 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008416 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008417 Py_ssize_t startpos, Py_ssize_t endpos,
8418 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008419{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008420 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008421
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008422 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008423 PyObject *restuple;
8424 PyObject *resunicode;
8425
8426 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008427 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008428 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008429 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008430 }
8431
8432 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008433 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008434 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008435 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008436
8437 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008438 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008439 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008440 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008441 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008442 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008443 Py_DECREF(restuple);
8444 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008445 }
8446 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008447 &resunicode, &i_newpos)) {
8448 Py_DECREF(restuple);
8449 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008450 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008451 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008452 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008453 else
8454 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008455 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008456 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8457 Py_DECREF(restuple);
8458 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008459 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008460 Py_INCREF(resunicode);
8461 Py_DECREF(restuple);
8462 return resunicode;
8463}
8464
8465/* Lookup the character ch in the mapping and put the result in result,
8466 which must be decrefed by the caller.
8467 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008468static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008469charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008470{
Christian Heimes217cfd12007-12-02 14:31:20 +00008471 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008472 PyObject *x;
8473
8474 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008475 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008476 x = PyObject_GetItem(mapping, w);
8477 Py_DECREF(w);
8478 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008479 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8480 /* No mapping found means: use 1:1 mapping. */
8481 PyErr_Clear();
8482 *result = NULL;
8483 return 0;
8484 } else
8485 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008486 }
8487 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008488 *result = x;
8489 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008490 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008491 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008492 long value = PyLong_AS_LONG(x);
8493 long max = PyUnicode_GetMax();
8494 if (value < 0 || value > max) {
8495 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008496 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008497 Py_DECREF(x);
8498 return -1;
8499 }
8500 *result = x;
8501 return 0;
8502 }
8503 else if (PyUnicode_Check(x)) {
8504 *result = x;
8505 return 0;
8506 }
8507 else {
8508 /* wrong return value */
8509 PyErr_SetString(PyExc_TypeError,
8510 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008511 Py_DECREF(x);
8512 return -1;
8513 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008514}
8515/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008516 if not reallocate and adjust various state variables.
8517 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008518static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008519charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008520 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008521{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008522 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00008523 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008524 /* exponentially overallocate to minimize reallocations */
8525 if (requiredsize < 2 * oldsize)
8526 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008527 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8528 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008529 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008530 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008531 }
8532 return 0;
8533}
8534/* lookup the character, put the result in the output string and adjust
8535 various state variables. Return a new reference to the object that
8536 was put in the output buffer in *result, or Py_None, if the mapping was
8537 undefined (in which case no character was written).
8538 The called must decref result.
8539 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008540static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008541charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8542 PyObject *mapping, Py_UCS4 **output,
8543 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008544 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008545{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008546 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8547 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008548 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008549 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008550 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008551 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008552 }
8553 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008554 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008555 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008556 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008557 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008558 }
8559 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008560 Py_ssize_t repsize;
8561 if (PyUnicode_READY(*res) == -1)
8562 return -1;
8563 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008564 if (repsize==1) {
8565 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008566 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008567 }
8568 else if (repsize!=0) {
8569 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008570 Py_ssize_t requiredsize = *opos +
8571 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008572 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008573 Py_ssize_t i;
8574 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008575 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008576 for(i = 0; i < repsize; i++)
8577 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008578 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008579 }
8580 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008581 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008582 return 0;
8583}
8584
Alexander Belopolsky40018472011-02-26 01:02:56 +00008585PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008586_PyUnicode_TranslateCharmap(PyObject *input,
8587 PyObject *mapping,
8588 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008589{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008590 /* input object */
8591 char *idata;
8592 Py_ssize_t size, i;
8593 int kind;
8594 /* output buffer */
8595 Py_UCS4 *output = NULL;
8596 Py_ssize_t osize;
8597 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008598 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008599 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008600 char *reason = "character maps to <undefined>";
8601 PyObject *errorHandler = NULL;
8602 PyObject *exc = NULL;
8603 /* the following variable is used for caching string comparisons
8604 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8605 * 3=ignore, 4=xmlcharrefreplace */
8606 int known_errorHandler = -1;
8607
Guido van Rossumd57fd912000-03-10 22:53:23 +00008608 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008609 PyErr_BadArgument();
8610 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008611 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008612
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008613 if (PyUnicode_READY(input) == -1)
8614 return NULL;
8615 idata = (char*)PyUnicode_DATA(input);
8616 kind = PyUnicode_KIND(input);
8617 size = PyUnicode_GET_LENGTH(input);
8618 i = 0;
8619
8620 if (size == 0) {
8621 Py_INCREF(input);
8622 return input;
8623 }
8624
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008625 /* allocate enough for a simple 1:1 translation without
8626 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008627 osize = size;
8628 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8629 opos = 0;
8630 if (output == NULL) {
8631 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008632 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008633 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008634
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008635 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008636 /* try to encode it */
8637 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008638 if (charmaptranslate_output(input, i, mapping,
8639 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008640 Py_XDECREF(x);
8641 goto onError;
8642 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008643 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008644 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008645 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008646 else { /* untranslatable character */
8647 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8648 Py_ssize_t repsize;
8649 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008650 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008651 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008652 Py_ssize_t collstart = i;
8653 Py_ssize_t collend = i+1;
8654 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008655
Benjamin Peterson29060642009-01-31 22:14:21 +00008656 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008657 while (collend < size) {
8658 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008659 goto onError;
8660 Py_XDECREF(x);
8661 if (x!=Py_None)
8662 break;
8663 ++collend;
8664 }
8665 /* cache callback name lookup
8666 * (if not done yet, i.e. it's the first error) */
8667 if (known_errorHandler==-1) {
8668 if ((errors==NULL) || (!strcmp(errors, "strict")))
8669 known_errorHandler = 1;
8670 else if (!strcmp(errors, "replace"))
8671 known_errorHandler = 2;
8672 else if (!strcmp(errors, "ignore"))
8673 known_errorHandler = 3;
8674 else if (!strcmp(errors, "xmlcharrefreplace"))
8675 known_errorHandler = 4;
8676 else
8677 known_errorHandler = 0;
8678 }
8679 switch (known_errorHandler) {
8680 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008681 raise_translate_exception(&exc, input, collstart,
8682 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008683 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008684 case 2: /* replace */
8685 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008686 for (coll = collstart; coll<collend; coll++)
8687 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008688 /* fall through */
8689 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008690 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008691 break;
8692 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008693 /* generate replacement (temporarily (mis)uses i) */
8694 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008695 char buffer[2+29+1+1];
8696 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008697 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8698 if (charmaptranslate_makespace(&output, &osize,
8699 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008700 goto onError;
8701 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008702 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008703 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008704 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008705 break;
8706 default:
8707 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008708 reason, input, &exc,
8709 collstart, collend, &newpos);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02008710 if (repunicode == NULL || _PyUnicode_READY_REPLACE(&repunicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00008711 goto onError;
8712 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008713 repsize = PyUnicode_GET_LENGTH(repunicode);
8714 if (charmaptranslate_makespace(&output, &osize,
8715 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008716 Py_DECREF(repunicode);
8717 goto onError;
8718 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008719 for (uni2 = 0; repsize-->0; ++uni2)
8720 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8721 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008722 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008723 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008724 }
8725 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008726 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8727 if (!res)
8728 goto onError;
8729 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008730 Py_XDECREF(exc);
8731 Py_XDECREF(errorHandler);
8732 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008733
Benjamin Peterson29060642009-01-31 22:14:21 +00008734 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008735 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008736 Py_XDECREF(exc);
8737 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008738 return NULL;
8739}
8740
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008741/* Deprecated. Use PyUnicode_Translate instead. */
8742PyObject *
8743PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8744 Py_ssize_t size,
8745 PyObject *mapping,
8746 const char *errors)
8747{
8748 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8749 if (!unicode)
8750 return NULL;
8751 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8752}
8753
Alexander Belopolsky40018472011-02-26 01:02:56 +00008754PyObject *
8755PyUnicode_Translate(PyObject *str,
8756 PyObject *mapping,
8757 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008758{
8759 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008760
Guido van Rossumd57fd912000-03-10 22:53:23 +00008761 str = PyUnicode_FromObject(str);
8762 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008763 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008764 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008765 Py_DECREF(str);
8766 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008767
Benjamin Peterson29060642009-01-31 22:14:21 +00008768 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008769 Py_XDECREF(str);
8770 return NULL;
8771}
Tim Petersced69f82003-09-16 20:30:58 +00008772
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008773static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008774fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008775{
8776 /* No need to call PyUnicode_READY(self) because this function is only
8777 called as a callback from fixup() which does it already. */
8778 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8779 const int kind = PyUnicode_KIND(self);
8780 void *data = PyUnicode_DATA(self);
8781 Py_UCS4 maxchar = 0, ch, fixed;
8782 Py_ssize_t i;
8783
8784 for (i = 0; i < len; ++i) {
8785 ch = PyUnicode_READ(kind, data, i);
8786 fixed = 0;
8787 if (ch > 127) {
8788 if (Py_UNICODE_ISSPACE(ch))
8789 fixed = ' ';
8790 else {
8791 const int decimal = Py_UNICODE_TODECIMAL(ch);
8792 if (decimal >= 0)
8793 fixed = '0' + decimal;
8794 }
8795 if (fixed != 0) {
8796 if (fixed > maxchar)
8797 maxchar = fixed;
8798 PyUnicode_WRITE(kind, data, i, fixed);
8799 }
8800 else if (ch > maxchar)
8801 maxchar = ch;
8802 }
8803 else if (ch > maxchar)
8804 maxchar = ch;
8805 }
8806
8807 return maxchar;
8808}
8809
8810PyObject *
8811_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8812{
8813 if (!PyUnicode_Check(unicode)) {
8814 PyErr_BadInternalCall();
8815 return NULL;
8816 }
8817 if (PyUnicode_READY(unicode) == -1)
8818 return NULL;
8819 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8820 /* If the string is already ASCII, just return the same string */
8821 Py_INCREF(unicode);
8822 return unicode;
8823 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008824 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008825}
8826
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008827PyObject *
8828PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8829 Py_ssize_t length)
8830{
8831 PyObject *result;
8832 Py_UNICODE *p; /* write pointer into result */
8833 Py_ssize_t i;
8834 /* Copy to a new string */
8835 result = (PyObject *)_PyUnicode_New(length);
8836 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
8837 if (result == NULL)
8838 return result;
8839 p = PyUnicode_AS_UNICODE(result);
8840 /* Iterate over code points */
8841 for (i = 0; i < length; i++) {
8842 Py_UNICODE ch =s[i];
8843 if (ch > 127) {
8844 int decimal = Py_UNICODE_TODECIMAL(ch);
8845 if (decimal >= 0)
8846 p[i] = '0' + decimal;
8847 }
8848 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008849#ifndef DONT_MAKE_RESULT_READY
8850 if (_PyUnicode_READY_REPLACE(&result)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008851 Py_DECREF(result);
8852 return NULL;
8853 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008854#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02008855 assert(_PyUnicode_CheckConsistency(result, 1));
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008856 return result;
8857}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008858/* --- Decimal Encoder ---------------------------------------------------- */
8859
Alexander Belopolsky40018472011-02-26 01:02:56 +00008860int
8861PyUnicode_EncodeDecimal(Py_UNICODE *s,
8862 Py_ssize_t length,
8863 char *output,
8864 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008865{
8866 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008867 PyObject *errorHandler = NULL;
8868 PyObject *exc = NULL;
8869 const char *encoding = "decimal";
8870 const char *reason = "invalid decimal Unicode string";
8871 /* the following variable is used for caching string comparisons
8872 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
8873 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008874
8875 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008876 PyErr_BadArgument();
8877 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008878 }
8879
8880 p = s;
8881 end = s + length;
8882 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008883 register Py_UNICODE ch = *p;
8884 int decimal;
8885 PyObject *repunicode;
8886 Py_ssize_t repsize;
8887 Py_ssize_t newpos;
8888 Py_UNICODE *uni2;
8889 Py_UNICODE *collstart;
8890 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00008891
Benjamin Peterson29060642009-01-31 22:14:21 +00008892 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008893 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00008894 ++p;
8895 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008896 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008897 decimal = Py_UNICODE_TODECIMAL(ch);
8898 if (decimal >= 0) {
8899 *output++ = '0' + decimal;
8900 ++p;
8901 continue;
8902 }
8903 if (0 < ch && ch < 256) {
8904 *output++ = (char)ch;
8905 ++p;
8906 continue;
8907 }
8908 /* All other characters are considered unencodable */
8909 collstart = p;
8910 collend = p+1;
8911 while (collend < end) {
8912 if ((0 < *collend && *collend < 256) ||
8913 !Py_UNICODE_ISSPACE(*collend) ||
8914 Py_UNICODE_TODECIMAL(*collend))
8915 break;
8916 }
8917 /* cache callback name lookup
8918 * (if not done yet, i.e. it's the first error) */
8919 if (known_errorHandler==-1) {
8920 if ((errors==NULL) || (!strcmp(errors, "strict")))
8921 known_errorHandler = 1;
8922 else if (!strcmp(errors, "replace"))
8923 known_errorHandler = 2;
8924 else if (!strcmp(errors, "ignore"))
8925 known_errorHandler = 3;
8926 else if (!strcmp(errors, "xmlcharrefreplace"))
8927 known_errorHandler = 4;
8928 else
8929 known_errorHandler = 0;
8930 }
8931 switch (known_errorHandler) {
8932 case 1: /* strict */
8933 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
8934 goto onError;
8935 case 2: /* replace */
8936 for (p = collstart; p < collend; ++p)
8937 *output++ = '?';
8938 /* fall through */
8939 case 3: /* ignore */
8940 p = collend;
8941 break;
8942 case 4: /* xmlcharrefreplace */
8943 /* generate replacement (temporarily (mis)uses p) */
8944 for (p = collstart; p < collend; ++p)
8945 output += sprintf(output, "&#%d;", (int)*p);
8946 p = collend;
8947 break;
8948 default:
8949 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
8950 encoding, reason, s, length, &exc,
8951 collstart-s, collend-s, &newpos);
8952 if (repunicode == NULL)
8953 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008954 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00008955 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008956 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8957 Py_DECREF(repunicode);
8958 goto onError;
8959 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008960 /* generate replacement */
8961 repsize = PyUnicode_GET_SIZE(repunicode);
8962 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8963 Py_UNICODE ch = *uni2;
8964 if (Py_UNICODE_ISSPACE(ch))
8965 *output++ = ' ';
8966 else {
8967 decimal = Py_UNICODE_TODECIMAL(ch);
8968 if (decimal >= 0)
8969 *output++ = '0' + decimal;
8970 else if (0 < ch && ch < 256)
8971 *output++ = (char)ch;
8972 else {
8973 Py_DECREF(repunicode);
8974 raise_encode_exception(&exc, encoding,
8975 s, length, collstart-s, collend-s, reason);
8976 goto onError;
8977 }
8978 }
8979 }
8980 p = s + newpos;
8981 Py_DECREF(repunicode);
8982 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00008983 }
8984 /* 0-terminate the output string */
8985 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008986 Py_XDECREF(exc);
8987 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008988 return 0;
8989
Benjamin Peterson29060642009-01-31 22:14:21 +00008990 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008991 Py_XDECREF(exc);
8992 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008993 return -1;
8994}
8995
Guido van Rossumd57fd912000-03-10 22:53:23 +00008996/* --- Helpers ------------------------------------------------------------ */
8997
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008998static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008999any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009000 Py_ssize_t start,
9001 Py_ssize_t end)
9002{
9003 int kind1, kind2, kind;
9004 void *buf1, *buf2;
9005 Py_ssize_t len1, len2, result;
9006
9007 kind1 = PyUnicode_KIND(s1);
9008 kind2 = PyUnicode_KIND(s2);
9009 kind = kind1 > kind2 ? kind1 : kind2;
9010 buf1 = PyUnicode_DATA(s1);
9011 buf2 = PyUnicode_DATA(s2);
9012 if (kind1 != kind)
9013 buf1 = _PyUnicode_AsKind(s1, kind);
9014 if (!buf1)
9015 return -2;
9016 if (kind2 != kind)
9017 buf2 = _PyUnicode_AsKind(s2, kind);
9018 if (!buf2) {
9019 if (kind1 != kind) PyMem_Free(buf1);
9020 return -2;
9021 }
9022 len1 = PyUnicode_GET_LENGTH(s1);
9023 len2 = PyUnicode_GET_LENGTH(s2);
9024
Victor Stinner794d5672011-10-10 03:21:36 +02009025 if (direction > 0) {
9026 switch(kind) {
9027 case PyUnicode_1BYTE_KIND:
9028 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9029 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9030 else
9031 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9032 break;
9033 case PyUnicode_2BYTE_KIND:
9034 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9035 break;
9036 case PyUnicode_4BYTE_KIND:
9037 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9038 break;
9039 default:
9040 assert(0); result = -2;
9041 }
9042 }
9043 else {
9044 switch(kind) {
9045 case PyUnicode_1BYTE_KIND:
9046 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9047 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9048 else
9049 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9050 break;
9051 case PyUnicode_2BYTE_KIND:
9052 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9053 break;
9054 case PyUnicode_4BYTE_KIND:
9055 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9056 break;
9057 default:
9058 assert(0); result = -2;
9059 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009060 }
9061
9062 if (kind1 != kind)
9063 PyMem_Free(buf1);
9064 if (kind2 != kind)
9065 PyMem_Free(buf2);
9066
9067 return result;
9068}
9069
9070Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009071_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009072 Py_ssize_t n_buffer,
9073 void *digits, Py_ssize_t n_digits,
9074 Py_ssize_t min_width,
9075 const char *grouping,
9076 const char *thousands_sep)
9077{
9078 switch(kind) {
9079 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009080 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
9081 return _PyUnicode_ascii_InsertThousandsGrouping(
9082 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9083 min_width, grouping, thousands_sep);
9084 else
9085 return _PyUnicode_ucs1_InsertThousandsGrouping(
9086 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9087 min_width, grouping, thousands_sep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009088 case PyUnicode_2BYTE_KIND:
9089 return _PyUnicode_ucs2_InsertThousandsGrouping(
9090 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
9091 min_width, grouping, thousands_sep);
9092 case PyUnicode_4BYTE_KIND:
9093 return _PyUnicode_ucs4_InsertThousandsGrouping(
9094 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
9095 min_width, grouping, thousands_sep);
9096 }
9097 assert(0);
9098 return -1;
9099}
9100
9101
/* Helper macro to fix up start/end slice values, in place, against a
   sequence of length len:
     - an end greater than len is clamped to len;
     - a negative end or start has len added to it and, if the sum is
       still negative, is set to 0.
   start is not clamped from above, so after the adjustment start may still
   be greater than end or than len.

   start and end must be modifiable lvalues.  Every argument is expanded
   several times and without parentheses, and the expansion is a pair of
   bare `if' statements rather than one statement: pass side-effect-free
   expressions only, and do not use the macro as the unbraced body of an
   if/else or a loop. */
#define ADJUST_INDICES(start, end, len)         \
    if (end > len)                              \
        end = len;                              \
    else if (end < 0) {                         \
        end += len;                             \
        if (end < 0)                            \
            end = 0;                            \
    }                                           \
    if (start < 0) {                            \
        start += len;                           \
        if (start < 0)                          \
            start = 0;                          \
    }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009116
Alexander Belopolsky40018472011-02-26 01:02:56 +00009117Py_ssize_t
9118PyUnicode_Count(PyObject *str,
9119 PyObject *substr,
9120 Py_ssize_t start,
9121 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009122{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009123 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009124 PyUnicodeObject* str_obj;
9125 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009126 int kind1, kind2, kind;
9127 void *buf1 = NULL, *buf2 = NULL;
9128 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009129
Thomas Wouters477c8d52006-05-27 19:21:47 +00009130 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009131 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009132 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009133 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02009134 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009135 Py_DECREF(str_obj);
9136 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009137 }
Tim Petersced69f82003-09-16 20:30:58 +00009138
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009139 kind1 = PyUnicode_KIND(str_obj);
9140 kind2 = PyUnicode_KIND(sub_obj);
9141 kind = kind1 > kind2 ? kind1 : kind2;
9142 buf1 = PyUnicode_DATA(str_obj);
9143 if (kind1 != kind)
9144 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
9145 if (!buf1)
9146 goto onError;
9147 buf2 = PyUnicode_DATA(sub_obj);
9148 if (kind2 != kind)
9149 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
9150 if (!buf2)
9151 goto onError;
9152 len1 = PyUnicode_GET_LENGTH(str_obj);
9153 len2 = PyUnicode_GET_LENGTH(sub_obj);
9154
9155 ADJUST_INDICES(start, end, len1);
9156 switch(kind) {
9157 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009158 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9159 result = asciilib_count(
9160 ((Py_UCS1*)buf1) + start, end - start,
9161 buf2, len2, PY_SSIZE_T_MAX
9162 );
9163 else
9164 result = ucs1lib_count(
9165 ((Py_UCS1*)buf1) + start, end - start,
9166 buf2, len2, PY_SSIZE_T_MAX
9167 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009168 break;
9169 case PyUnicode_2BYTE_KIND:
9170 result = ucs2lib_count(
9171 ((Py_UCS2*)buf1) + start, end - start,
9172 buf2, len2, PY_SSIZE_T_MAX
9173 );
9174 break;
9175 case PyUnicode_4BYTE_KIND:
9176 result = ucs4lib_count(
9177 ((Py_UCS4*)buf1) + start, end - start,
9178 buf2, len2, PY_SSIZE_T_MAX
9179 );
9180 break;
9181 default:
9182 assert(0); result = 0;
9183 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009184
9185 Py_DECREF(sub_obj);
9186 Py_DECREF(str_obj);
9187
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009188 if (kind1 != kind)
9189 PyMem_Free(buf1);
9190 if (kind2 != kind)
9191 PyMem_Free(buf2);
9192
Guido van Rossumd57fd912000-03-10 22:53:23 +00009193 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009194 onError:
9195 Py_DECREF(sub_obj);
9196 Py_DECREF(str_obj);
9197 if (kind1 != kind && buf1)
9198 PyMem_Free(buf1);
9199 if (kind2 != kind && buf2)
9200 PyMem_Free(buf2);
9201 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009202}
9203
Alexander Belopolsky40018472011-02-26 01:02:56 +00009204Py_ssize_t
9205PyUnicode_Find(PyObject *str,
9206 PyObject *sub,
9207 Py_ssize_t start,
9208 Py_ssize_t end,
9209 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009210{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009211 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009212
Guido van Rossumd57fd912000-03-10 22:53:23 +00009213 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009214 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009215 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009216 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009217 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009218 Py_DECREF(str);
9219 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009220 }
Tim Petersced69f82003-09-16 20:30:58 +00009221
Victor Stinner794d5672011-10-10 03:21:36 +02009222 result = any_find_slice(direction,
9223 str, sub, start, end
9224 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009225
Guido van Rossumd57fd912000-03-10 22:53:23 +00009226 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009227 Py_DECREF(sub);
9228
Guido van Rossumd57fd912000-03-10 22:53:23 +00009229 return result;
9230}
9231
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009232Py_ssize_t
9233PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9234 Py_ssize_t start, Py_ssize_t end,
9235 int direction)
9236{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009237 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009238 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009239 if (PyUnicode_READY(str) == -1)
9240 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009241 if (start < 0 || end < 0) {
9242 PyErr_SetString(PyExc_IndexError, "string index out of range");
9243 return -2;
9244 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009245 if (end > PyUnicode_GET_LENGTH(str))
9246 end = PyUnicode_GET_LENGTH(str);
9247 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009248 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9249 kind, end-start, ch, direction);
9250 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009251 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009252 else
9253 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009254}
9255
Alexander Belopolsky40018472011-02-26 01:02:56 +00009256static int
9257tailmatch(PyUnicodeObject *self,
9258 PyUnicodeObject *substring,
9259 Py_ssize_t start,
9260 Py_ssize_t end,
9261 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009262{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009263 int kind_self;
9264 int kind_sub;
9265 void *data_self;
9266 void *data_sub;
9267 Py_ssize_t offset;
9268 Py_ssize_t i;
9269 Py_ssize_t end_sub;
9270
9271 if (PyUnicode_READY(self) == -1 ||
9272 PyUnicode_READY(substring) == -1)
9273 return 0;
9274
9275 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009276 return 1;
9277
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009278 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9279 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009280 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009281 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009282
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009283 kind_self = PyUnicode_KIND(self);
9284 data_self = PyUnicode_DATA(self);
9285 kind_sub = PyUnicode_KIND(substring);
9286 data_sub = PyUnicode_DATA(substring);
9287 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9288
9289 if (direction > 0)
9290 offset = end;
9291 else
9292 offset = start;
9293
9294 if (PyUnicode_READ(kind_self, data_self, offset) ==
9295 PyUnicode_READ(kind_sub, data_sub, 0) &&
9296 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9297 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9298 /* If both are of the same kind, memcmp is sufficient */
9299 if (kind_self == kind_sub) {
9300 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009301 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009302 data_sub,
9303 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009304 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009305 }
9306 /* otherwise we have to compare each character by first accesing it */
9307 else {
9308 /* We do not need to compare 0 and len(substring)-1 because
9309 the if statement above ensured already that they are equal
9310 when we end up here. */
9311 // TODO: honor direction and do a forward or backwards search
9312 for (i = 1; i < end_sub; ++i) {
9313 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9314 PyUnicode_READ(kind_sub, data_sub, i))
9315 return 0;
9316 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009317 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009318 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009319 }
9320
9321 return 0;
9322}
9323
Alexander Belopolsky40018472011-02-26 01:02:56 +00009324Py_ssize_t
9325PyUnicode_Tailmatch(PyObject *str,
9326 PyObject *substr,
9327 Py_ssize_t start,
9328 Py_ssize_t end,
9329 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009330{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009331 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009332
Guido van Rossumd57fd912000-03-10 22:53:23 +00009333 str = PyUnicode_FromObject(str);
9334 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009335 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009336 substr = PyUnicode_FromObject(substr);
9337 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009338 Py_DECREF(str);
9339 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009340 }
Tim Petersced69f82003-09-16 20:30:58 +00009341
Guido van Rossumd57fd912000-03-10 22:53:23 +00009342 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00009343 (PyUnicodeObject *)substr,
9344 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009345 Py_DECREF(str);
9346 Py_DECREF(substr);
9347 return result;
9348}
9349
Guido van Rossumd57fd912000-03-10 22:53:23 +00009350/* Apply fixfct filter to the Unicode object self and return a
9351 reference to the modified object */
9352
Alexander Belopolsky40018472011-02-26 01:02:56 +00009353static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009354fixup(PyObject *self,
9355 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009356{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009357 PyObject *u;
9358 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009359
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009360 if (PyUnicode_READY(self) == -1)
9361 return NULL;
9362 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
9363 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
9364 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009365 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009366 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009367
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009368 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009369 PyUnicode_GET_LENGTH(u) * PyUnicode_KIND(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009370
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009371 /* fix functions return the new maximum character in a string,
9372 if the kind of the resulting unicode object does not change,
9373 everything is fine. Otherwise we need to change the string kind
9374 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009375 maxchar_new = fixfct(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009376 if (maxchar_new == 0)
9377 /* do nothing, keep maxchar_new at 0 which means no changes. */;
9378 else if (maxchar_new <= 127)
9379 maxchar_new = 127;
9380 else if (maxchar_new <= 255)
9381 maxchar_new = 255;
9382 else if (maxchar_new <= 65535)
9383 maxchar_new = 65535;
9384 else
9385 maxchar_new = 1114111; /* 0x10ffff */
9386
9387 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009388 /* fixfct should return TRUE if it modified the buffer. If
9389 FALSE, return a reference to the original buffer instead
9390 (to save space, not time) */
9391 Py_INCREF(self);
9392 Py_DECREF(u);
9393 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009394 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009395 else if (maxchar_new == maxchar_old) {
9396 return u;
9397 }
9398 else {
9399 /* In case the maximum character changed, we need to
9400 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009401 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009402 if (v == NULL) {
9403 Py_DECREF(u);
9404 return NULL;
9405 }
9406 if (maxchar_new > maxchar_old) {
9407 /* If the maxchar increased so that the kind changed, not all
9408 characters are representable anymore and we need to fix the
9409 string again. This only happens in very few cases. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009410 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner9310abb2011-10-05 00:59:23 +02009411 maxchar_old = fixfct(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009412 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
9413 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009414 else {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009415 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009416 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009417
9418 Py_DECREF(u);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009419 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009420 return v;
9421 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009422}
9423
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009424static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009425fixupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009426{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009427 /* No need to call PyUnicode_READY(self) because this function is only
9428 called as a callback from fixup() which does it already. */
9429 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9430 const int kind = PyUnicode_KIND(self);
9431 void *data = PyUnicode_DATA(self);
9432 int touched = 0;
9433 Py_UCS4 maxchar = 0;
9434 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009435
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009436 for (i = 0; i < len; ++i) {
9437 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9438 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
9439 if (up != ch) {
9440 if (up > maxchar)
9441 maxchar = up;
9442 PyUnicode_WRITE(kind, data, i, up);
9443 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009444 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009445 else if (ch > maxchar)
9446 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009447 }
9448
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009449 if (touched)
9450 return maxchar;
9451 else
9452 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009453}
9454
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009455static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009456fixlower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009457{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009458 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9459 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9460 const int kind = PyUnicode_KIND(self);
9461 void *data = PyUnicode_DATA(self);
9462 int touched = 0;
9463 Py_UCS4 maxchar = 0;
9464 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009465
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009466 for(i = 0; i < len; ++i) {
9467 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9468 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9469 if (lo != ch) {
9470 if (lo > maxchar)
9471 maxchar = lo;
9472 PyUnicode_WRITE(kind, data, i, lo);
9473 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009474 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009475 else if (ch > maxchar)
9476 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009477 }
9478
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009479 if (touched)
9480 return maxchar;
9481 else
9482 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009483}
9484
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009485static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009486fixswapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009487{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009488 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9489 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9490 const int kind = PyUnicode_KIND(self);
9491 void *data = PyUnicode_DATA(self);
9492 int touched = 0;
9493 Py_UCS4 maxchar = 0;
9494 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009495
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009496 for(i = 0; i < len; ++i) {
9497 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9498 Py_UCS4 nu = 0;
9499
9500 if (Py_UNICODE_ISUPPER(ch))
9501 nu = Py_UNICODE_TOLOWER(ch);
9502 else if (Py_UNICODE_ISLOWER(ch))
9503 nu = Py_UNICODE_TOUPPER(ch);
9504
9505 if (nu != 0) {
9506 if (nu > maxchar)
9507 maxchar = nu;
9508 PyUnicode_WRITE(kind, data, i, nu);
9509 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009510 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009511 else if (ch > maxchar)
9512 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009513 }
9514
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009515 if (touched)
9516 return maxchar;
9517 else
9518 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009519}
9520
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009521static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009522fixcapitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009523{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009524 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9525 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9526 const int kind = PyUnicode_KIND(self);
9527 void *data = PyUnicode_DATA(self);
9528 int touched = 0;
9529 Py_UCS4 maxchar = 0;
9530 Py_ssize_t i = 0;
9531 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00009532
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009533 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009534 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009535
9536 ch = PyUnicode_READ(kind, data, i);
9537 if (!Py_UNICODE_ISUPPER(ch)) {
9538 maxchar = Py_UNICODE_TOUPPER(ch);
9539 PyUnicode_WRITE(kind, data, i, maxchar);
9540 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009541 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009542 ++i;
9543 for(; i < len; ++i) {
9544 ch = PyUnicode_READ(kind, data, i);
9545 if (!Py_UNICODE_ISLOWER(ch)) {
9546 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9547 if (lo > maxchar)
9548 maxchar = lo;
9549 PyUnicode_WRITE(kind, data, i, lo);
9550 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009551 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009552 else if (ch > maxchar)
9553 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009554 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009555
9556 if (touched)
9557 return maxchar;
9558 else
9559 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009560}
9561
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009562static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009563fixtitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009564{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009565 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9566 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9567 const int kind = PyUnicode_KIND(self);
9568 void *data = PyUnicode_DATA(self);
9569 Py_UCS4 maxchar = 0;
9570 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009571 int previous_is_cased;
9572
9573 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009574 if (len == 1) {
9575 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9576 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
9577 if (ti != ch) {
9578 PyUnicode_WRITE(kind, data, i, ti);
9579 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00009580 }
9581 else
9582 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009583 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009584 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009585 for(; i < len; ++i) {
9586 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9587 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00009588
Benjamin Peterson29060642009-01-31 22:14:21 +00009589 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009590 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00009591 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009592 nu = Py_UNICODE_TOTITLE(ch);
9593
9594 if (nu > maxchar)
9595 maxchar = nu;
9596 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00009597
Benjamin Peterson29060642009-01-31 22:14:21 +00009598 if (Py_UNICODE_ISLOWER(ch) ||
9599 Py_UNICODE_ISUPPER(ch) ||
9600 Py_UNICODE_ISTITLE(ch))
9601 previous_is_cased = 1;
9602 else
9603 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009604 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009605 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009606}
9607
Tim Peters8ce9f162004-08-27 01:49:32 +00009608PyObject *
9609PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009610{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009611 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009612 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009613 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009614 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009615 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9616 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009617 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009618 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009619 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009620 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009621 int use_memcpy;
9622 unsigned char *res_data = NULL, *sep_data = NULL;
9623 PyObject *last_obj;
9624 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009625
Tim Peters05eba1f2004-08-27 21:32:02 +00009626 fseq = PySequence_Fast(seq, "");
9627 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009628 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009629 }
9630
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009631 /* NOTE: the following code can't call back into Python code,
9632 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009633 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009634
Tim Peters05eba1f2004-08-27 21:32:02 +00009635 seqlen = PySequence_Fast_GET_SIZE(fseq);
9636 /* If empty sequence, return u"". */
9637 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009638 Py_DECREF(fseq);
9639 Py_INCREF(unicode_empty);
9640 res = unicode_empty;
9641 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009642 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009643
Tim Peters05eba1f2004-08-27 21:32:02 +00009644 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009645 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009646 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009647 if (seqlen == 1) {
9648 if (PyUnicode_CheckExact(items[0])) {
9649 res = items[0];
9650 Py_INCREF(res);
9651 Py_DECREF(fseq);
9652 return res;
9653 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009654 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009655 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009656 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009657 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009658 /* Set up sep and seplen */
9659 if (separator == NULL) {
9660 /* fall back to a blank space separator */
9661 sep = PyUnicode_FromOrdinal(' ');
9662 if (!sep)
9663 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009664 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009665 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009666 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009667 else {
9668 if (!PyUnicode_Check(separator)) {
9669 PyErr_Format(PyExc_TypeError,
9670 "separator: expected str instance,"
9671 " %.80s found",
9672 Py_TYPE(separator)->tp_name);
9673 goto onError;
9674 }
9675 if (PyUnicode_READY(separator))
9676 goto onError;
9677 sep = separator;
9678 seplen = PyUnicode_GET_LENGTH(separator);
9679 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9680 /* inc refcount to keep this code path symmetric with the
9681 above case of a blank separator */
9682 Py_INCREF(sep);
9683 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009684 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009685 }
9686
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009687 /* There are at least two things to join, or else we have a subclass
9688 * of str in the sequence.
9689 * Do a pre-pass to figure out the total amount of space we'll
9690 * need (sz), and see whether all argument are strings.
9691 */
9692 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009693#ifdef Py_DEBUG
9694 use_memcpy = 0;
9695#else
9696 use_memcpy = 1;
9697#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009698 for (i = 0; i < seqlen; i++) {
9699 const Py_ssize_t old_sz = sz;
9700 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009701 if (!PyUnicode_Check(item)) {
9702 PyErr_Format(PyExc_TypeError,
9703 "sequence item %zd: expected str instance,"
9704 " %.80s found",
9705 i, Py_TYPE(item)->tp_name);
9706 goto onError;
9707 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009708 if (PyUnicode_READY(item) == -1)
9709 goto onError;
9710 sz += PyUnicode_GET_LENGTH(item);
9711 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009712 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009713 if (i != 0)
9714 sz += seplen;
9715 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9716 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009717 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009718 goto onError;
9719 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009720 if (use_memcpy && last_obj != NULL) {
9721 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9722 use_memcpy = 0;
9723 }
9724 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009725 }
Tim Petersced69f82003-09-16 20:30:58 +00009726
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009727 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009728 if (res == NULL)
9729 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009730
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009731 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009732#ifdef Py_DEBUG
9733 use_memcpy = 0;
9734#else
9735 if (use_memcpy) {
9736 res_data = PyUnicode_1BYTE_DATA(res);
9737 kind = PyUnicode_KIND(res);
9738 if (seplen != 0)
9739 sep_data = PyUnicode_1BYTE_DATA(sep);
9740 }
9741#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009742 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009743 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009744 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009745 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009746 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009747 if (use_memcpy) {
9748 Py_MEMCPY(res_data,
9749 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009750 kind * seplen);
9751 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009752 }
9753 else {
9754 copy_characters(res, res_offset, sep, 0, seplen);
9755 res_offset += seplen;
9756 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009757 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009758 itemlen = PyUnicode_GET_LENGTH(item);
9759 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009760 if (use_memcpy) {
9761 Py_MEMCPY(res_data,
9762 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009763 kind * itemlen);
9764 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009765 }
9766 else {
9767 copy_characters(res, res_offset, item, 0, itemlen);
9768 res_offset += itemlen;
9769 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009770 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009771 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009772 if (use_memcpy)
9773 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009774 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009775 else
9776 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009777
Tim Peters05eba1f2004-08-27 21:32:02 +00009778 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009779 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009780 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009781 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009782
Benjamin Peterson29060642009-01-31 22:14:21 +00009783 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009784 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009785 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009786 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009787 return NULL;
9788}
9789
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009790#define FILL(kind, data, value, start, length) \
9791 do { \
9792 Py_ssize_t i_ = 0; \
9793 assert(kind != PyUnicode_WCHAR_KIND); \
9794 switch ((kind)) { \
9795 case PyUnicode_1BYTE_KIND: { \
9796 unsigned char * to_ = (unsigned char *)((data)) + (start); \
9797 memset(to_, (unsigned char)value, length); \
9798 break; \
9799 } \
9800 case PyUnicode_2BYTE_KIND: { \
9801 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
9802 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9803 break; \
9804 } \
9805 default: { \
9806 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
9807 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9808 break; \
9809 } \
9810 } \
9811 } while (0)
9812
Victor Stinner9310abb2011-10-05 00:59:23 +02009813static PyObject *
9814pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009815 Py_ssize_t left,
9816 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009817 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009818{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009819 PyObject *u;
9820 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009821 int kind;
9822 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009823
9824 if (left < 0)
9825 left = 0;
9826 if (right < 0)
9827 right = 0;
9828
Tim Peters7a29bd52001-09-12 03:03:31 +00009829 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009830 Py_INCREF(self);
9831 return self;
9832 }
9833
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009834 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9835 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009836 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9837 return NULL;
9838 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009839 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9840 if (fill > maxchar)
9841 maxchar = fill;
9842 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009843 if (!u)
9844 return NULL;
9845
9846 kind = PyUnicode_KIND(u);
9847 data = PyUnicode_DATA(u);
9848 if (left)
9849 FILL(kind, data, fill, 0, left);
9850 if (right)
9851 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009852 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009853 assert(_PyUnicode_CheckConsistency(u, 1));
9854 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009855}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009856#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009857
Alexander Belopolsky40018472011-02-26 01:02:56 +00009858PyObject *
9859PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009860{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009861 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009862
9863 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009864 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009865 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009866
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009867 switch(PyUnicode_KIND(string)) {
9868 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009869 if (PyUnicode_IS_ASCII(string))
9870 list = asciilib_splitlines(
9871 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9872 PyUnicode_GET_LENGTH(string), keepends);
9873 else
9874 list = ucs1lib_splitlines(
9875 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9876 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009877 break;
9878 case PyUnicode_2BYTE_KIND:
9879 list = ucs2lib_splitlines(
9880 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
9881 PyUnicode_GET_LENGTH(string), keepends);
9882 break;
9883 case PyUnicode_4BYTE_KIND:
9884 list = ucs4lib_splitlines(
9885 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
9886 PyUnicode_GET_LENGTH(string), keepends);
9887 break;
9888 default:
9889 assert(0);
9890 list = 0;
9891 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009892 Py_DECREF(string);
9893 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009894}
9895
Alexander Belopolsky40018472011-02-26 01:02:56 +00009896static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009897split(PyObject *self,
9898 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009899 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009900{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009901 int kind1, kind2, kind;
9902 void *buf1, *buf2;
9903 Py_ssize_t len1, len2;
9904 PyObject* out;
9905
Guido van Rossumd57fd912000-03-10 22:53:23 +00009906 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009907 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009908
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009909 if (PyUnicode_READY(self) == -1)
9910 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009911
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009912 if (substring == NULL)
9913 switch(PyUnicode_KIND(self)) {
9914 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009915 if (PyUnicode_IS_ASCII(self))
9916 return asciilib_split_whitespace(
9917 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9918 PyUnicode_GET_LENGTH(self), maxcount
9919 );
9920 else
9921 return ucs1lib_split_whitespace(
9922 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9923 PyUnicode_GET_LENGTH(self), maxcount
9924 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009925 case PyUnicode_2BYTE_KIND:
9926 return ucs2lib_split_whitespace(
9927 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9928 PyUnicode_GET_LENGTH(self), maxcount
9929 );
9930 case PyUnicode_4BYTE_KIND:
9931 return ucs4lib_split_whitespace(
9932 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9933 PyUnicode_GET_LENGTH(self), maxcount
9934 );
9935 default:
9936 assert(0);
9937 return NULL;
9938 }
9939
9940 if (PyUnicode_READY(substring) == -1)
9941 return NULL;
9942
9943 kind1 = PyUnicode_KIND(self);
9944 kind2 = PyUnicode_KIND(substring);
9945 kind = kind1 > kind2 ? kind1 : kind2;
9946 buf1 = PyUnicode_DATA(self);
9947 buf2 = PyUnicode_DATA(substring);
9948 if (kind1 != kind)
9949 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9950 if (!buf1)
9951 return NULL;
9952 if (kind2 != kind)
9953 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9954 if (!buf2) {
9955 if (kind1 != kind) PyMem_Free(buf1);
9956 return NULL;
9957 }
9958 len1 = PyUnicode_GET_LENGTH(self);
9959 len2 = PyUnicode_GET_LENGTH(substring);
9960
9961 switch(kind) {
9962 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009963 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9964 out = asciilib_split(
9965 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9966 else
9967 out = ucs1lib_split(
9968 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009969 break;
9970 case PyUnicode_2BYTE_KIND:
9971 out = ucs2lib_split(
9972 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9973 break;
9974 case PyUnicode_4BYTE_KIND:
9975 out = ucs4lib_split(
9976 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9977 break;
9978 default:
9979 out = NULL;
9980 }
9981 if (kind1 != kind)
9982 PyMem_Free(buf1);
9983 if (kind2 != kind)
9984 PyMem_Free(buf2);
9985 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009986}
9987
Alexander Belopolsky40018472011-02-26 01:02:56 +00009988static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009989rsplit(PyObject *self,
9990 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009991 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009992{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009993 int kind1, kind2, kind;
9994 void *buf1, *buf2;
9995 Py_ssize_t len1, len2;
9996 PyObject* out;
9997
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009998 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009999 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010000
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010001 if (PyUnicode_READY(self) == -1)
10002 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010003
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010004 if (substring == NULL)
10005 switch(PyUnicode_KIND(self)) {
10006 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010007 if (PyUnicode_IS_ASCII(self))
10008 return asciilib_rsplit_whitespace(
10009 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
10010 PyUnicode_GET_LENGTH(self), maxcount
10011 );
10012 else
10013 return ucs1lib_rsplit_whitespace(
10014 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
10015 PyUnicode_GET_LENGTH(self), maxcount
10016 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010017 case PyUnicode_2BYTE_KIND:
10018 return ucs2lib_rsplit_whitespace(
10019 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
10020 PyUnicode_GET_LENGTH(self), maxcount
10021 );
10022 case PyUnicode_4BYTE_KIND:
10023 return ucs4lib_rsplit_whitespace(
10024 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
10025 PyUnicode_GET_LENGTH(self), maxcount
10026 );
10027 default:
10028 assert(0);
10029 return NULL;
10030 }
10031
10032 if (PyUnicode_READY(substring) == -1)
10033 return NULL;
10034
10035 kind1 = PyUnicode_KIND(self);
10036 kind2 = PyUnicode_KIND(substring);
10037 kind = kind1 > kind2 ? kind1 : kind2;
10038 buf1 = PyUnicode_DATA(self);
10039 buf2 = PyUnicode_DATA(substring);
10040 if (kind1 != kind)
10041 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
10042 if (!buf1)
10043 return NULL;
10044 if (kind2 != kind)
10045 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
10046 if (!buf2) {
10047 if (kind1 != kind) PyMem_Free(buf1);
10048 return NULL;
10049 }
10050 len1 = PyUnicode_GET_LENGTH(self);
10051 len2 = PyUnicode_GET_LENGTH(substring);
10052
10053 switch(kind) {
10054 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010055 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10056 out = asciilib_rsplit(
10057 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
10058 else
10059 out = ucs1lib_rsplit(
10060 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010061 break;
10062 case PyUnicode_2BYTE_KIND:
10063 out = ucs2lib_rsplit(
10064 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
10065 break;
10066 case PyUnicode_4BYTE_KIND:
10067 out = ucs4lib_rsplit(
10068 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
10069 break;
10070 default:
10071 out = NULL;
10072 }
10073 if (kind1 != kind)
10074 PyMem_Free(buf1);
10075 if (kind2 != kind)
10076 PyMem_Free(buf2);
10077 return out;
10078}
10079
10080static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010081anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10082 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010083{
10084 switch(kind) {
10085 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010086 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10087 return asciilib_find(buf1, len1, buf2, len2, offset);
10088 else
10089 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010090 case PyUnicode_2BYTE_KIND:
10091 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10092 case PyUnicode_4BYTE_KIND:
10093 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10094 }
10095 assert(0);
10096 return -1;
10097}
10098
10099static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010100anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10101 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010102{
10103 switch(kind) {
10104 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010105 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10106 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10107 else
10108 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010109 case PyUnicode_2BYTE_KIND:
10110 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10111 case PyUnicode_4BYTE_KIND:
10112 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10113 }
10114 assert(0);
10115 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010116}
10117
Alexander Belopolsky40018472011-02-26 01:02:56 +000010118static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010119replace(PyObject *self, PyObject *str1,
10120 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010121{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010122 PyObject *u;
10123 char *sbuf = PyUnicode_DATA(self);
10124 char *buf1 = PyUnicode_DATA(str1);
10125 char *buf2 = PyUnicode_DATA(str2);
10126 int srelease = 0, release1 = 0, release2 = 0;
10127 int skind = PyUnicode_KIND(self);
10128 int kind1 = PyUnicode_KIND(str1);
10129 int kind2 = PyUnicode_KIND(str2);
10130 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10131 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10132 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010133 int mayshrink;
10134 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010135
10136 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010137 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010138 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010139 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010140
Victor Stinner59de0ee2011-10-07 10:01:28 +020010141 if (str1 == str2)
10142 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010143 if (skind < kind1)
10144 /* substring too wide to be present */
10145 goto nothing;
10146
Victor Stinner49a0a212011-10-12 23:46:10 +020010147 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10148 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10149 /* Replacing str1 with str2 may cause a maxchar reduction in the
10150 result string. */
10151 mayshrink = (maxchar_str2 < maxchar);
10152 maxchar = Py_MAX(maxchar, maxchar_str2);
10153
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010154 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +000010155 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010156 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010157 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010158 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010159 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010160 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010161 Py_UCS4 u1, u2;
10162 int rkind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010163 u1 = PyUnicode_READ_CHAR(str1, 0);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +020010164 if (findchar(sbuf, PyUnicode_KIND(self),
10165 slen, u1, 1) < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010166 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010167 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010168 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010169 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010170 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010171 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010172 rkind = PyUnicode_KIND(u);
10173 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
10174 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010175 if (--maxcount < 0)
10176 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010177 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010178 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010179 }
10180 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010181 int rkind = skind;
10182 char *res;
Victor Stinner25a4b292011-10-06 12:31:55 +020010183
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010184 if (kind1 < rkind) {
10185 /* widen substring */
10186 buf1 = _PyUnicode_AsKind(str1, rkind);
10187 if (!buf1) goto error;
10188 release1 = 1;
10189 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010190 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010191 if (i < 0)
10192 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010193 if (rkind > kind2) {
10194 /* widen replacement */
10195 buf2 = _PyUnicode_AsKind(str2, rkind);
10196 if (!buf2) goto error;
10197 release2 = 1;
10198 }
10199 else if (rkind < kind2) {
10200 /* widen self and buf1 */
10201 rkind = kind2;
10202 if (release1) PyMem_Free(buf1);
10203 sbuf = _PyUnicode_AsKind(self, rkind);
10204 if (!sbuf) goto error;
10205 srelease = 1;
10206 buf1 = _PyUnicode_AsKind(str1, rkind);
10207 if (!buf1) goto error;
10208 release1 = 1;
10209 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010210 u = PyUnicode_New(slen, maxchar);
10211 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010212 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010213 assert(PyUnicode_KIND(u) == rkind);
10214 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010215
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010216 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010217 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010218 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010219 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010220 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010221 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010222
10223 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010224 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010225 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010226 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010227 if (i == -1)
10228 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010229 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010230 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010231 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010232 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010233 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010234 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010235 }
10236 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010237 Py_ssize_t n, i, j, ires;
10238 Py_ssize_t product, new_size;
10239 int rkind = skind;
10240 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010241
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010242 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010243 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010244 buf1 = _PyUnicode_AsKind(str1, rkind);
10245 if (!buf1) goto error;
10246 release1 = 1;
10247 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010248 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010249 if (n == 0)
10250 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010251 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010252 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010253 buf2 = _PyUnicode_AsKind(str2, rkind);
10254 if (!buf2) goto error;
10255 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010256 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010257 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010258 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010259 rkind = kind2;
10260 sbuf = _PyUnicode_AsKind(self, rkind);
10261 if (!sbuf) goto error;
10262 srelease = 1;
10263 if (release1) PyMem_Free(buf1);
10264 buf1 = _PyUnicode_AsKind(str1, rkind);
10265 if (!buf1) goto error;
10266 release1 = 1;
10267 }
10268 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10269 PyUnicode_GET_LENGTH(str1))); */
10270 product = n * (len2-len1);
10271 if ((product / (len2-len1)) != n) {
10272 PyErr_SetString(PyExc_OverflowError,
10273 "replace string is too long");
10274 goto error;
10275 }
10276 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +020010277 if (new_size == 0) {
10278 Py_INCREF(unicode_empty);
10279 u = unicode_empty;
10280 goto done;
10281 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010282 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10283 PyErr_SetString(PyExc_OverflowError,
10284 "replace string is too long");
10285 goto error;
10286 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010287 u = PyUnicode_New(new_size, maxchar);
10288 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010289 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010290 assert(PyUnicode_KIND(u) == rkind);
10291 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010292 ires = i = 0;
10293 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010294 while (n-- > 0) {
10295 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010296 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010297 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010298 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010299 if (j == -1)
10300 break;
10301 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010302 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010303 memcpy(res + rkind * ires,
10304 sbuf + rkind * i,
10305 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010306 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010307 }
10308 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010309 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010310 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010311 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010312 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010313 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010314 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010315 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010316 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010317 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010318 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010319 memcpy(res + rkind * ires,
10320 sbuf + rkind * i,
10321 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010322 }
10323 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010324 /* interleave */
10325 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010326 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010327 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010328 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010329 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010330 if (--n <= 0)
10331 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010332 memcpy(res + rkind * ires,
10333 sbuf + rkind * i,
10334 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010335 ires++;
10336 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010337 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010338 memcpy(res + rkind * ires,
10339 sbuf + rkind * i,
10340 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010341 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010342 }
10343
10344 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010345 unicode_adjust_maxchar(&u);
10346 if (u == NULL)
10347 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010348 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010349
10350 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010351 if (srelease)
10352 PyMem_FREE(sbuf);
10353 if (release1)
10354 PyMem_FREE(buf1);
10355 if (release2)
10356 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010357 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010358 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010359
Benjamin Peterson29060642009-01-31 22:14:21 +000010360 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010361 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010362 if (srelease)
10363 PyMem_FREE(sbuf);
10364 if (release1)
10365 PyMem_FREE(buf1);
10366 if (release2)
10367 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010368 if (PyUnicode_CheckExact(self)) {
10369 Py_INCREF(self);
10370 return (PyObject *) self;
10371 }
Victor Stinner034f6cf2011-09-30 02:26:44 +020010372 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010373 error:
10374 if (srelease && sbuf)
10375 PyMem_FREE(sbuf);
10376 if (release1 && buf1)
10377 PyMem_FREE(buf1);
10378 if (release2 && buf2)
10379 PyMem_FREE(buf2);
10380 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010381}
10382
10383/* --- Unicode Object Methods --------------------------------------------- */
10384
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010385PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010386 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010387\n\
10388Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010389characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010390
10391static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010392unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010393{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010394 return fixup(self, fixtitle);
10395}
10396
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010397PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010398 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010399\n\
10400Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010401have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010402
10403static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010404unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010405{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010406 return fixup(self, fixcapitalize);
10407}
10408
#if 0
/* Disabled: this block is compiled out by the surrounding #if 0. */
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> str\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split self into words, capitalize every word in place inside the list,
   then join the words again.  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list slot with the new object */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *old_word = PyList_GET_ITEM(words, idx);
        PyObject *new_word = fixup((PyUnicodeObject *)old_word,
                                   fixcapitalize);
        if (new_word == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(old_word);
        PyList_SET_ITEM(words, idx, new_word);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
10446
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010447/* Argument converter. Coerces to a single unicode character */
10448
10449static int
10450convert_uc(PyObject *obj, void *addr)
10451{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010452 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010453 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010454
Benjamin Peterson14339b62009-01-31 16:36:08 +000010455 uniobj = PyUnicode_FromObject(obj);
10456 if (uniobj == NULL) {
10457 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010458 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010459 return 0;
10460 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010461 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010462 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010463 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010464 Py_DECREF(uniobj);
10465 return 0;
10466 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010467 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010468 Py_DECREF(uniobj);
10469 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010470}
10471
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010472PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010473 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010474\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010475Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010476done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010477
10478static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010479unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010480{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010481 Py_ssize_t marg, left;
10482 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010483 Py_UCS4 fillchar = ' ';
10484
Victor Stinnere9a29352011-10-01 02:14:59 +020010485 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010486 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010487
Victor Stinnere9a29352011-10-01 02:14:59 +020010488 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010489 return NULL;
10490
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010491 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010492 Py_INCREF(self);
10493 return (PyObject*) self;
10494 }
10495
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010496 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010497 left = marg / 2 + (marg & width & 1);
10498
Victor Stinner9310abb2011-10-05 00:59:23 +020010499 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010500}
10501
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010502/* This function assumes that str1 and str2 are readied by the caller. */
10503
Marc-André Lemburge5034372000-08-08 08:04:29 +000010504static int
10505unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
10506{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010507 int kind1, kind2;
10508 void *data1, *data2;
10509 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010510
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010511 kind1 = PyUnicode_KIND(str1);
10512 kind2 = PyUnicode_KIND(str2);
10513 data1 = PyUnicode_DATA(str1);
10514 data2 = PyUnicode_DATA(str2);
10515 len1 = PyUnicode_GET_LENGTH(str1);
10516 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010517
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010518 for (i = 0; i < len1 && i < len2; ++i) {
10519 Py_UCS4 c1, c2;
10520 c1 = PyUnicode_READ(kind1, data1, i);
10521 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010522
10523 if (c1 != c2)
10524 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010525 }
10526
10527 return (len1 < len2) ? -1 : (len1 != len2);
10528}
10529
Alexander Belopolsky40018472011-02-26 01:02:56 +000010530int
10531PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010532{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010533 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10534 if (PyUnicode_READY(left) == -1 ||
10535 PyUnicode_READY(right) == -1)
10536 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010537 return unicode_compare((PyUnicodeObject *)left,
10538 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010539 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010540 PyErr_Format(PyExc_TypeError,
10541 "Can't compare %.100s and %.100s",
10542 left->ob_type->tp_name,
10543 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010544 return -1;
10545}
10546
Martin v. Löwis5b222132007-06-10 09:51:05 +000010547int
10548PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10549{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010550 Py_ssize_t i;
10551 int kind;
10552 void *data;
10553 Py_UCS4 chr;
10554
Victor Stinner910337b2011-10-03 03:20:16 +020010555 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010556 if (PyUnicode_READY(uni) == -1)
10557 return -1;
10558 kind = PyUnicode_KIND(uni);
10559 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010560 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010561 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10562 if (chr != str[i])
10563 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010564 /* This check keeps Python strings that end in '\0' from comparing equal
10565 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010566 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010567 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010568 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010569 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010570 return 0;
10571}
10572
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010573
Benjamin Peterson29060642009-01-31 22:14:21 +000010574#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010575 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010576
Alexander Belopolsky40018472011-02-26 01:02:56 +000010577PyObject *
10578PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010579{
10580 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010581
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010582 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10583 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010584 if (PyUnicode_READY(left) == -1 ||
10585 PyUnicode_READY(right) == -1)
10586 return NULL;
10587 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10588 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010589 if (op == Py_EQ) {
10590 Py_INCREF(Py_False);
10591 return Py_False;
10592 }
10593 if (op == Py_NE) {
10594 Py_INCREF(Py_True);
10595 return Py_True;
10596 }
10597 }
10598 if (left == right)
10599 result = 0;
10600 else
10601 result = unicode_compare((PyUnicodeObject *)left,
10602 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010603
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010604 /* Convert the return value to a Boolean */
10605 switch (op) {
10606 case Py_EQ:
10607 v = TEST_COND(result == 0);
10608 break;
10609 case Py_NE:
10610 v = TEST_COND(result != 0);
10611 break;
10612 case Py_LE:
10613 v = TEST_COND(result <= 0);
10614 break;
10615 case Py_GE:
10616 v = TEST_COND(result >= 0);
10617 break;
10618 case Py_LT:
10619 v = TEST_COND(result == -1);
10620 break;
10621 case Py_GT:
10622 v = TEST_COND(result == 1);
10623 break;
10624 default:
10625 PyErr_BadArgument();
10626 return NULL;
10627 }
10628 Py_INCREF(v);
10629 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010630 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010631
Brian Curtindfc80e32011-08-10 20:28:54 -050010632 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010633}
10634
Alexander Belopolsky40018472011-02-26 01:02:56 +000010635int
10636PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010637{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010638 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010639 int kind1, kind2, kind;
10640 void *buf1, *buf2;
10641 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010642 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010643
10644 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010645 sub = PyUnicode_FromObject(element);
10646 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010647 PyErr_Format(PyExc_TypeError,
10648 "'in <string>' requires string as left operand, not %s",
10649 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010650 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010651 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010652 if (PyUnicode_READY(sub) == -1)
10653 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010654
Thomas Wouters477c8d52006-05-27 19:21:47 +000010655 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +020010656 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010657 Py_DECREF(sub);
10658 return -1;
10659 }
10660
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010661 kind1 = PyUnicode_KIND(str);
10662 kind2 = PyUnicode_KIND(sub);
10663 kind = kind1 > kind2 ? kind1 : kind2;
10664 buf1 = PyUnicode_DATA(str);
10665 buf2 = PyUnicode_DATA(sub);
10666 if (kind1 != kind)
10667 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
10668 if (!buf1) {
10669 Py_DECREF(sub);
10670 return -1;
10671 }
10672 if (kind2 != kind)
10673 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
10674 if (!buf2) {
10675 Py_DECREF(sub);
10676 if (kind1 != kind) PyMem_Free(buf1);
10677 return -1;
10678 }
10679 len1 = PyUnicode_GET_LENGTH(str);
10680 len2 = PyUnicode_GET_LENGTH(sub);
10681
10682 switch(kind) {
10683 case PyUnicode_1BYTE_KIND:
10684 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10685 break;
10686 case PyUnicode_2BYTE_KIND:
10687 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10688 break;
10689 case PyUnicode_4BYTE_KIND:
10690 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10691 break;
10692 default:
10693 result = -1;
10694 assert(0);
10695 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010696
10697 Py_DECREF(str);
10698 Py_DECREF(sub);
10699
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010700 if (kind1 != kind)
10701 PyMem_Free(buf1);
10702 if (kind2 != kind)
10703 PyMem_Free(buf2);
10704
Guido van Rossum403d68b2000-03-13 15:55:09 +000010705 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010706}
10707
Guido van Rossumd57fd912000-03-10 22:53:23 +000010708/* Concat to string or Unicode object giving a new Unicode object. */
10709
Alexander Belopolsky40018472011-02-26 01:02:56 +000010710PyObject *
10711PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010712{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010713 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010714 Py_UCS4 maxchar, maxchar2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010715
10716 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010717 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010718 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010719 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010720 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010721 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010722 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010723
10724 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010725 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010726 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010727 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010728 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010729 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010730 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010731 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010732 }
10733
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010734 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010735 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
10736 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010737
Guido van Rossumd57fd912000-03-10 22:53:23 +000010738 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010739 w = PyUnicode_New(
10740 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
10741 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010742 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010743 goto onError;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010744 copy_characters(w, 0, u, 0, PyUnicode_GET_LENGTH(u));
10745 copy_characters(w, PyUnicode_GET_LENGTH(u), v, 0, PyUnicode_GET_LENGTH(v));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010746 Py_DECREF(u);
10747 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010748 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010749 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010750
Benjamin Peterson29060642009-01-31 22:14:21 +000010751 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010752 Py_XDECREF(u);
10753 Py_XDECREF(v);
10754 return NULL;
10755}
10756
Victor Stinnerb0923652011-10-04 01:17:31 +020010757static void
10758unicode_append_inplace(PyObject **p_left, PyObject *right)
10759{
10760 Py_ssize_t left_len, right_len, new_len;
Victor Stinnerb0923652011-10-04 01:17:31 +020010761
10762 assert(PyUnicode_IS_READY(*p_left));
10763 assert(PyUnicode_IS_READY(right));
10764
10765 left_len = PyUnicode_GET_LENGTH(*p_left);
10766 right_len = PyUnicode_GET_LENGTH(right);
10767 if (left_len > PY_SSIZE_T_MAX - right_len) {
10768 PyErr_SetString(PyExc_OverflowError,
10769 "strings are too large to concat");
10770 goto error;
10771 }
10772 new_len = left_len + right_len;
10773
10774 /* Now we own the last reference to 'left', so we can resize it
10775 * in-place.
10776 */
10777 if (unicode_resize(p_left, new_len) != 0) {
10778 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10779 * deallocated so it cannot be put back into
10780 * 'variable'. The MemoryError is raised when there
10781 * is no value in 'variable', which might (very
10782 * remotely) be a cause of incompatibilities.
10783 */
10784 goto error;
10785 }
10786 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010787 copy_characters(*p_left, left_len, right, 0, right_len);
10788 _PyUnicode_DIRTY(*p_left);
Victor Stinnerb0923652011-10-04 01:17:31 +020010789 return;
10790
10791error:
10792 Py_DECREF(*p_left);
10793 *p_left = NULL;
10794}
10795
Walter Dörwald1ab83302007-05-18 17:15:44 +000010796void
Victor Stinner23e56682011-10-03 03:54:37 +020010797PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010798{
Victor Stinner23e56682011-10-03 03:54:37 +020010799 PyObject *left, *res;
10800
10801 if (p_left == NULL) {
10802 if (!PyErr_Occurred())
10803 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010804 return;
10805 }
Victor Stinner23e56682011-10-03 03:54:37 +020010806 left = *p_left;
10807 if (right == NULL || !PyUnicode_Check(left)) {
10808 if (!PyErr_Occurred())
10809 PyErr_BadInternalCall();
10810 goto error;
10811 }
10812
Victor Stinnere1335c72011-10-04 20:53:03 +020010813 if (PyUnicode_READY(left))
10814 goto error;
10815 if (PyUnicode_READY(right))
10816 goto error;
10817
Victor Stinner23e56682011-10-03 03:54:37 +020010818 if (PyUnicode_CheckExact(left) && left != unicode_empty
10819 && PyUnicode_CheckExact(right) && right != unicode_empty
10820 && unicode_resizable(left)
10821 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
10822 || _PyUnicode_WSTR(left) != NULL))
10823 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010824 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10825 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010826 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010827 not so different than duplicating the string. */
10828 if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
Victor Stinner23e56682011-10-03 03:54:37 +020010829 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010830 unicode_append_inplace(p_left, right);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010831 if (p_left != NULL)
10832 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010833 return;
10834 }
10835 }
10836
10837 res = PyUnicode_Concat(left, right);
10838 if (res == NULL)
10839 goto error;
10840 Py_DECREF(left);
10841 *p_left = res;
10842 return;
10843
10844error:
10845 Py_DECREF(*p_left);
10846 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +000010847}
10848
10849void
10850PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10851{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010852 PyUnicode_Append(pleft, right);
10853 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010854}
10855
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010856PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010857 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010858\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010859Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010860string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010861interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010862
10863static PyObject *
10864unicode_count(PyUnicodeObject *self, PyObject *args)
10865{
10866 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010867 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010868 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010869 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010870 int kind1, kind2, kind;
10871 void *buf1, *buf2;
10872 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010873
Jesus Ceaac451502011-04-20 17:09:23 +020010874 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10875 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010876 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010877
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010878 kind1 = PyUnicode_KIND(self);
10879 kind2 = PyUnicode_KIND(substring);
10880 kind = kind1 > kind2 ? kind1 : kind2;
10881 buf1 = PyUnicode_DATA(self);
10882 buf2 = PyUnicode_DATA(substring);
10883 if (kind1 != kind)
10884 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
10885 if (!buf1) {
10886 Py_DECREF(substring);
10887 return NULL;
10888 }
10889 if (kind2 != kind)
10890 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
10891 if (!buf2) {
10892 Py_DECREF(substring);
10893 if (kind1 != kind) PyMem_Free(buf1);
10894 return NULL;
10895 }
10896 len1 = PyUnicode_GET_LENGTH(self);
10897 len2 = PyUnicode_GET_LENGTH(substring);
10898
10899 ADJUST_INDICES(start, end, len1);
10900 switch(kind) {
10901 case PyUnicode_1BYTE_KIND:
10902 iresult = ucs1lib_count(
10903 ((Py_UCS1*)buf1) + start, end - start,
10904 buf2, len2, PY_SSIZE_T_MAX
10905 );
10906 break;
10907 case PyUnicode_2BYTE_KIND:
10908 iresult = ucs2lib_count(
10909 ((Py_UCS2*)buf1) + start, end - start,
10910 buf2, len2, PY_SSIZE_T_MAX
10911 );
10912 break;
10913 case PyUnicode_4BYTE_KIND:
10914 iresult = ucs4lib_count(
10915 ((Py_UCS4*)buf1) + start, end - start,
10916 buf2, len2, PY_SSIZE_T_MAX
10917 );
10918 break;
10919 default:
10920 assert(0); iresult = 0;
10921 }
10922
10923 result = PyLong_FromSsize_t(iresult);
10924
10925 if (kind1 != kind)
10926 PyMem_Free(buf1);
10927 if (kind2 != kind)
10928 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010929
10930 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010931
Guido van Rossumd57fd912000-03-10 22:53:23 +000010932 return result;
10933}
10934
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010935PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010936 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010937\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010938Encode S using the codec registered for encoding. Default encoding\n\
10939is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010940handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010941a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10942'xmlcharrefreplace' as well as any other name registered with\n\
10943codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010944
10945static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +000010946unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010947{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010948 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010949 char *encoding = NULL;
10950 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010951
Benjamin Peterson308d6372009-09-18 21:42:35 +000010952 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10953 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010954 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +000010955 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010956}
10957
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010958PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010959 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010960\n\
10961Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010962If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010963
10964static PyObject*
10965unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
10966{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010967 Py_ssize_t i, j, line_pos, src_len, incr;
10968 Py_UCS4 ch;
10969 PyObject *u;
10970 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010971 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010972 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010973 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010974
10975 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010976 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010977
Antoine Pitrou22425222011-10-04 19:10:51 +020010978 if (PyUnicode_READY(self) == -1)
10979 return NULL;
10980
Thomas Wouters7e474022000-07-16 12:04:32 +000010981 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010982 src_len = PyUnicode_GET_LENGTH(self);
10983 i = j = line_pos = 0;
10984 kind = PyUnicode_KIND(self);
10985 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010986 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010987 for (; i < src_len; i++) {
10988 ch = PyUnicode_READ(kind, src_data, i);
10989 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010990 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010991 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010992 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010993 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010994 goto overflow;
10995 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010996 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010997 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010998 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010999 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011000 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011001 goto overflow;
11002 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011003 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011004 if (ch == '\n' || ch == '\r')
11005 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011006 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011007 }
Antoine Pitroue19aa382011-10-04 16:04:01 +020011008 if (!found && PyUnicode_CheckExact(self)) {
11009 Py_INCREF((PyObject *) self);
11010 return (PyObject *) self;
11011 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011012
Guido van Rossumd57fd912000-03-10 22:53:23 +000011013 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011014 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011015 if (!u)
11016 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011017 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011018
Antoine Pitroue71d5742011-10-04 15:55:09 +020011019 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011020
Antoine Pitroue71d5742011-10-04 15:55:09 +020011021 for (; i < src_len; i++) {
11022 ch = PyUnicode_READ(kind, src_data, i);
11023 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011024 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011025 incr = tabsize - (line_pos % tabsize);
11026 line_pos += incr;
11027 while (incr--) {
11028 PyUnicode_WRITE(kind, dest_data, j, ' ');
11029 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011030 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011031 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011032 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011033 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011034 line_pos++;
11035 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011036 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011037 if (ch == '\n' || ch == '\r')
11038 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011039 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011040 }
11041 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinner17efeed2011-10-04 20:05:46 +020011042#ifndef DONT_MAKE_RESULT_READY
11043 if (_PyUnicode_READY_REPLACE(&u)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011044 Py_DECREF(u);
11045 return NULL;
11046 }
Victor Stinner17efeed2011-10-04 20:05:46 +020011047#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011048 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011049 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011050
Antoine Pitroue71d5742011-10-04 15:55:09 +020011051 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011052 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11053 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011054}
11055
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011056PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011057 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011058\n\
11059Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011060such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011061arguments start and end are interpreted as in slice notation.\n\
11062\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011063Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011064
11065static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011066unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011067{
Jesus Ceaac451502011-04-20 17:09:23 +020011068 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011069 Py_ssize_t start;
11070 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011071 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011072
Jesus Ceaac451502011-04-20 17:09:23 +020011073 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11074 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011075 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011076
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011077 if (PyUnicode_READY(self) == -1)
11078 return NULL;
11079 if (PyUnicode_READY(substring) == -1)
11080 return NULL;
11081
Victor Stinner794d5672011-10-10 03:21:36 +020011082 result = any_find_slice(1,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011083 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011084 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011085
11086 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011087
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011088 if (result == -2)
11089 return NULL;
11090
Christian Heimes217cfd12007-12-02 14:31:20 +000011091 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011092}
11093
11094static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011095unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011096{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011097 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
11098 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011099 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011100 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011101}
11102
Guido van Rossumc2504932007-09-18 19:42:40 +000011103/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011104 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011105static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +000011106unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011107{
Guido van Rossumc2504932007-09-18 19:42:40 +000011108 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010011109 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011110
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011111 if (_PyUnicode_HASH(self) != -1)
11112 return _PyUnicode_HASH(self);
11113 if (PyUnicode_READY(self) == -1)
11114 return -1;
11115 len = PyUnicode_GET_LENGTH(self);
11116
11117 /* The hash function as a macro, gets expanded three times below. */
11118#define HASH(P) \
11119 x = (Py_uhash_t)*P << 7; \
11120 while (--len >= 0) \
11121 x = (1000003*x) ^ (Py_uhash_t)*P++;
11122
11123 switch (PyUnicode_KIND(self)) {
11124 case PyUnicode_1BYTE_KIND: {
11125 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11126 HASH(c);
11127 break;
11128 }
11129 case PyUnicode_2BYTE_KIND: {
11130 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11131 HASH(s);
11132 break;
11133 }
11134 default: {
11135 Py_UCS4 *l;
11136 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11137 "Impossible switch case in unicode_hash");
11138 l = PyUnicode_4BYTE_DATA(self);
11139 HASH(l);
11140 break;
11141 }
11142 }
11143 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
11144
Guido van Rossumc2504932007-09-18 19:42:40 +000011145 if (x == -1)
11146 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011147 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011148 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011149}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011150#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011151
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011152PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011153 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011154\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011155Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011156
11157static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011158unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011159{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011160 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +020011161 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011162 Py_ssize_t start;
11163 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011164
Jesus Ceaac451502011-04-20 17:09:23 +020011165 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11166 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011167 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011168
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011169 if (PyUnicode_READY(self) == -1)
11170 return NULL;
11171 if (PyUnicode_READY(substring) == -1)
11172 return NULL;
11173
Victor Stinner794d5672011-10-10 03:21:36 +020011174 result = any_find_slice(1,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011175 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011176 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011177
11178 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011179
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011180 if (result == -2)
11181 return NULL;
11182
Guido van Rossumd57fd912000-03-10 22:53:23 +000011183 if (result < 0) {
11184 PyErr_SetString(PyExc_ValueError, "substring not found");
11185 return NULL;
11186 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011187
Christian Heimes217cfd12007-12-02 14:31:20 +000011188 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011189}
11190
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011191PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011192 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011193\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011194Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011195at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011196
11197static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011198unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011199{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011200 Py_ssize_t i, length;
11201 int kind;
11202 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011203 int cased;
11204
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011205 if (PyUnicode_READY(self) == -1)
11206 return NULL;
11207 length = PyUnicode_GET_LENGTH(self);
11208 kind = PyUnicode_KIND(self);
11209 data = PyUnicode_DATA(self);
11210
Guido van Rossumd57fd912000-03-10 22:53:23 +000011211 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011212 if (length == 1)
11213 return PyBool_FromLong(
11214 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011215
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011216 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011217 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011218 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011219
Guido van Rossumd57fd912000-03-10 22:53:23 +000011220 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011221 for (i = 0; i < length; i++) {
11222 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011223
Benjamin Peterson29060642009-01-31 22:14:21 +000011224 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11225 return PyBool_FromLong(0);
11226 else if (!cased && Py_UNICODE_ISLOWER(ch))
11227 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011228 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011229 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011230}
11231
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011232PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011233 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011234\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011235Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011236at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011237
11238static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011239unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011240{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011241 Py_ssize_t i, length;
11242 int kind;
11243 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011244 int cased;
11245
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011246 if (PyUnicode_READY(self) == -1)
11247 return NULL;
11248 length = PyUnicode_GET_LENGTH(self);
11249 kind = PyUnicode_KIND(self);
11250 data = PyUnicode_DATA(self);
11251
Guido van Rossumd57fd912000-03-10 22:53:23 +000011252 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011253 if (length == 1)
11254 return PyBool_FromLong(
11255 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011256
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011257 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011258 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011259 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011260
Guido van Rossumd57fd912000-03-10 22:53:23 +000011261 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011262 for (i = 0; i < length; i++) {
11263 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011264
Benjamin Peterson29060642009-01-31 22:14:21 +000011265 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11266 return PyBool_FromLong(0);
11267 else if (!cased && Py_UNICODE_ISUPPER(ch))
11268 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011269 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011270 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011271}
11272
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011273PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011274 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011275\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011276Return True if S is a titlecased string and there is at least one\n\
11277character in S, i.e. upper- and titlecase characters may only\n\
11278follow uncased characters and lowercase characters only cased ones.\n\
11279Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011280
11281static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011282unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011283{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011284 Py_ssize_t i, length;
11285 int kind;
11286 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011287 int cased, previous_is_cased;
11288
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011289 if (PyUnicode_READY(self) == -1)
11290 return NULL;
11291 length = PyUnicode_GET_LENGTH(self);
11292 kind = PyUnicode_KIND(self);
11293 data = PyUnicode_DATA(self);
11294
Guido van Rossumd57fd912000-03-10 22:53:23 +000011295 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011296 if (length == 1) {
11297 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11298 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11299 (Py_UNICODE_ISUPPER(ch) != 0));
11300 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011301
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011302 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011303 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011304 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011305
Guido van Rossumd57fd912000-03-10 22:53:23 +000011306 cased = 0;
11307 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011308 for (i = 0; i < length; i++) {
11309 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011310
Benjamin Peterson29060642009-01-31 22:14:21 +000011311 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11312 if (previous_is_cased)
11313 return PyBool_FromLong(0);
11314 previous_is_cased = 1;
11315 cased = 1;
11316 }
11317 else if (Py_UNICODE_ISLOWER(ch)) {
11318 if (!previous_is_cased)
11319 return PyBool_FromLong(0);
11320 previous_is_cased = 1;
11321 cased = 1;
11322 }
11323 else
11324 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011325 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011326 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011327}
11328
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011329PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011330 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011331\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011332Return True if all characters in S are whitespace\n\
11333and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011334
11335static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011336unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011337{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011338 Py_ssize_t i, length;
11339 int kind;
11340 void *data;
11341
11342 if (PyUnicode_READY(self) == -1)
11343 return NULL;
11344 length = PyUnicode_GET_LENGTH(self);
11345 kind = PyUnicode_KIND(self);
11346 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011347
Guido van Rossumd57fd912000-03-10 22:53:23 +000011348 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011349 if (length == 1)
11350 return PyBool_FromLong(
11351 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011352
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011353 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011354 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011355 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011356
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011357 for (i = 0; i < length; i++) {
11358 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011359 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011360 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011361 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011362 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011363}
11364
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011365PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011366 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011367\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011368Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011369and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011370
11371static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011372unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011373{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011374 Py_ssize_t i, length;
11375 int kind;
11376 void *data;
11377
11378 if (PyUnicode_READY(self) == -1)
11379 return NULL;
11380 length = PyUnicode_GET_LENGTH(self);
11381 kind = PyUnicode_KIND(self);
11382 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011383
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011384 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011385 if (length == 1)
11386 return PyBool_FromLong(
11387 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011388
11389 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011390 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011391 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011392
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011393 for (i = 0; i < length; i++) {
11394 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011395 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011396 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011397 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011398}
11399
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011400PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011401 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011402\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011403Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011404and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011405
11406static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011407unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011408{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011409 int kind;
11410 void *data;
11411 Py_ssize_t len, i;
11412
11413 if (PyUnicode_READY(self) == -1)
11414 return NULL;
11415
11416 kind = PyUnicode_KIND(self);
11417 data = PyUnicode_DATA(self);
11418 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011419
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011420 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011421 if (len == 1) {
11422 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11423 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11424 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011425
11426 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011427 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011428 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011429
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011430 for (i = 0; i < len; i++) {
11431 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011432 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011433 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011434 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011435 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011436}
11437
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011438PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011439 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011440\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011441Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011442False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011443
11444static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011445unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011446{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011447 Py_ssize_t i, length;
11448 int kind;
11449 void *data;
11450
11451 if (PyUnicode_READY(self) == -1)
11452 return NULL;
11453 length = PyUnicode_GET_LENGTH(self);
11454 kind = PyUnicode_KIND(self);
11455 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011456
Guido van Rossumd57fd912000-03-10 22:53:23 +000011457 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011458 if (length == 1)
11459 return PyBool_FromLong(
11460 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011461
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011462 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011463 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011464 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011465
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011466 for (i = 0; i < length; i++) {
11467 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011468 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011469 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011470 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011471}
11472
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011473PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011474 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011475\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011476Return True if all characters in S are digits\n\
11477and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011478
11479static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011480unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011481{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011482 Py_ssize_t i, length;
11483 int kind;
11484 void *data;
11485
11486 if (PyUnicode_READY(self) == -1)
11487 return NULL;
11488 length = PyUnicode_GET_LENGTH(self);
11489 kind = PyUnicode_KIND(self);
11490 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011491
Guido van Rossumd57fd912000-03-10 22:53:23 +000011492 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011493 if (length == 1) {
11494 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11495 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11496 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011497
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011498 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011499 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011500 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011501
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011502 for (i = 0; i < length; i++) {
11503 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011504 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011505 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011506 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011507}
11508
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011509PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011510 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011511\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011512Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011513False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011514
11515static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011516unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011517{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011518 Py_ssize_t i, length;
11519 int kind;
11520 void *data;
11521
11522 if (PyUnicode_READY(self) == -1)
11523 return NULL;
11524 length = PyUnicode_GET_LENGTH(self);
11525 kind = PyUnicode_KIND(self);
11526 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011527
Guido van Rossumd57fd912000-03-10 22:53:23 +000011528 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011529 if (length == 1)
11530 return PyBool_FromLong(
11531 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011532
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011533 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011534 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011535 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011536
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011537 for (i = 0; i < length; i++) {
11538 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011539 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011540 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011541 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011542}
11543
Martin v. Löwis47383402007-08-15 07:32:56 +000011544int
11545PyUnicode_IsIdentifier(PyObject *self)
11546{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011547 int kind;
11548 void *data;
11549 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011550 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011551
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011552 if (PyUnicode_READY(self) == -1) {
11553 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011554 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011555 }
11556
11557 /* Special case for empty strings */
11558 if (PyUnicode_GET_LENGTH(self) == 0)
11559 return 0;
11560 kind = PyUnicode_KIND(self);
11561 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011562
11563 /* PEP 3131 says that the first character must be in
11564 XID_Start and subsequent characters in XID_Continue,
11565 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011566 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011567 letters, digits, underscore). However, given the current
11568 definition of XID_Start and XID_Continue, it is sufficient
11569 to check just for these, except that _ must be allowed
11570 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011571 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011572 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011573 return 0;
11574
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011575 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011576 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011577 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011578 return 1;
11579}
11580
11581PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011582 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011583\n\
11584Return True if S is a valid identifier according\n\
11585to the language definition.");
11586
11587static PyObject*
11588unicode_isidentifier(PyObject *self)
11589{
11590 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11591}
11592
Georg Brandl559e5d72008-06-11 18:37:52 +000011593PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011594 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011595\n\
11596Return True if all characters in S are considered\n\
11597printable in repr() or S is empty, False otherwise.");
11598
11599static PyObject*
11600unicode_isprintable(PyObject *self)
11601{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011602 Py_ssize_t i, length;
11603 int kind;
11604 void *data;
11605
11606 if (PyUnicode_READY(self) == -1)
11607 return NULL;
11608 length = PyUnicode_GET_LENGTH(self);
11609 kind = PyUnicode_KIND(self);
11610 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011611
11612 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011613 if (length == 1)
11614 return PyBool_FromLong(
11615 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011616
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011617 for (i = 0; i < length; i++) {
11618 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011619 Py_RETURN_FALSE;
11620 }
11621 }
11622 Py_RETURN_TRUE;
11623}
11624
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011625PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011626 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011627\n\
11628Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011629iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011630
11631static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011632unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011633{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011634 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011635}
11636
Martin v. Löwis18e16552006-02-15 17:27:45 +000011637static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000011638unicode_length(PyUnicodeObject *self)
11639{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011640 if (PyUnicode_READY(self) == -1)
11641 return -1;
11642 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011643}
11644
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011645PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011646 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011647\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011648Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011649done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011650
11651static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011652unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011653{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011654 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011655 Py_UCS4 fillchar = ' ';
11656
11657 if (PyUnicode_READY(self) == -1)
11658 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011659
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011660 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011661 return NULL;
11662
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011663 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011664 Py_INCREF(self);
11665 return (PyObject*) self;
11666 }
11667
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011668 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011669}
11670
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011671PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011672 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011673\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011674Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011675
11676static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011677unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011678{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011679 return fixup(self, fixlower);
11680}
11681
/* Strip modes shared by the strip helpers below.  The values double as
   indexes into stripformat[], so they must stay 0, 1, 2 in this order. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by the strip mode above.
   Each one accepts a single optional object and names the method after
   the ':' for use in error messages. */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Bare method name for strip mode i: skips the 3-character "|O:" prefix
   of the matching format string. */
#define STRIPNAME(i) (stripformat[i]+3)
11690
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011691/* externally visible for str.strip(unicode) */
11692PyObject *
11693_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
11694{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011695 void *data;
11696 int kind;
11697 Py_ssize_t i, j, len;
11698 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011699
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011700 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11701 return NULL;
11702
11703 kind = PyUnicode_KIND(self);
11704 data = PyUnicode_DATA(self);
11705 len = PyUnicode_GET_LENGTH(self);
11706 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11707 PyUnicode_DATA(sepobj),
11708 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011709
Benjamin Peterson14339b62009-01-31 16:36:08 +000011710 i = 0;
11711 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011712 while (i < len &&
11713 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011714 i++;
11715 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011716 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011717
Benjamin Peterson14339b62009-01-31 16:36:08 +000011718 j = len;
11719 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011720 do {
11721 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011722 } while (j >= i &&
11723 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011724 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011725 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011726
Victor Stinner12bab6d2011-10-01 01:53:49 +020011727 return PyUnicode_Substring((PyObject*)self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011728}
11729
11730PyObject*
11731PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11732{
11733 unsigned char *data;
11734 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011735 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011736
Victor Stinnerde636f32011-10-01 03:55:54 +020011737 if (PyUnicode_READY(self) == -1)
11738 return NULL;
11739
11740 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11741
Victor Stinner12bab6d2011-10-01 01:53:49 +020011742 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011743 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011744 if (PyUnicode_CheckExact(self)) {
11745 Py_INCREF(self);
11746 return self;
11747 }
11748 else
11749 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011750 }
11751
Victor Stinner12bab6d2011-10-01 01:53:49 +020011752 length = end - start;
11753 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011754 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011755
Victor Stinnerde636f32011-10-01 03:55:54 +020011756 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011757 PyErr_SetString(PyExc_IndexError, "string index out of range");
11758 return NULL;
11759 }
11760
Victor Stinnerb9275c12011-10-05 14:01:42 +020011761 if (PyUnicode_IS_ASCII(self)) {
11762 kind = PyUnicode_KIND(self);
11763 data = PyUnicode_1BYTE_DATA(self);
11764 return unicode_fromascii(data + start, length);
11765 }
11766 else {
11767 kind = PyUnicode_KIND(self);
11768 data = PyUnicode_1BYTE_DATA(self);
11769 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011770 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011771 length);
11772 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011773}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011774
11775static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011776do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011777{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011778 int kind;
11779 void *data;
11780 Py_ssize_t len, i, j;
11781
11782 if (PyUnicode_READY(self) == -1)
11783 return NULL;
11784
11785 kind = PyUnicode_KIND(self);
11786 data = PyUnicode_DATA(self);
11787 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011788
Benjamin Peterson14339b62009-01-31 16:36:08 +000011789 i = 0;
11790 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011791 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011792 i++;
11793 }
11794 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011795
Benjamin Peterson14339b62009-01-31 16:36:08 +000011796 j = len;
11797 if (striptype != LEFTSTRIP) {
11798 do {
11799 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011800 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011801 j++;
11802 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011803
Victor Stinner12bab6d2011-10-01 01:53:49 +020011804 return PyUnicode_Substring((PyObject*)self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011805}
11806
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011807
11808static PyObject *
11809do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
11810{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011811 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011812
Benjamin Peterson14339b62009-01-31 16:36:08 +000011813 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11814 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011815
Benjamin Peterson14339b62009-01-31 16:36:08 +000011816 if (sep != NULL && sep != Py_None) {
11817 if (PyUnicode_Check(sep))
11818 return _PyUnicode_XStrip(self, striptype, sep);
11819 else {
11820 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011821 "%s arg must be None or str",
11822 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011823 return NULL;
11824 }
11825 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011826
Benjamin Peterson14339b62009-01-31 16:36:08 +000011827 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011828}
11829
11830
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011831PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011832 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011833\n\
11834Return a copy of the string S with leading and trailing\n\
11835whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011836If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011837
11838static PyObject *
11839unicode_strip(PyUnicodeObject *self, PyObject *args)
11840{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011841 if (PyTuple_GET_SIZE(args) == 0)
11842 return do_strip(self, BOTHSTRIP); /* Common case */
11843 else
11844 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011845}
11846
11847
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011848PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011849 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011850\n\
11851Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011852If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011853
11854static PyObject *
11855unicode_lstrip(PyUnicodeObject *self, PyObject *args)
11856{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011857 if (PyTuple_GET_SIZE(args) == 0)
11858 return do_strip(self, LEFTSTRIP); /* Common case */
11859 else
11860 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011861}
11862
11863
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011864PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011865 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011866\n\
11867Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011868If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011869
11870static PyObject *
11871unicode_rstrip(PyUnicodeObject *self, PyObject *args)
11872{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011873 if (PyTuple_GET_SIZE(args) == 0)
11874 return do_strip(self, RIGHTSTRIP); /* Common case */
11875 else
11876 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011877}
11878
11879
Guido van Rossumd57fd912000-03-10 22:53:23 +000011880static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000011881unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011882{
11883 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011884 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011885
Georg Brandl222de0f2009-04-12 12:01:50 +000011886 if (len < 1) {
11887 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011888 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011889 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011890
Tim Peters7a29bd52001-09-12 03:03:31 +000011891 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011892 /* no repeat, return original string */
11893 Py_INCREF(str);
11894 return (PyObject*) str;
11895 }
Tim Peters8f422462000-09-09 06:13:41 +000011896
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011897 if (PyUnicode_READY(str) == -1)
11898 return NULL;
11899
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011900 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011901 PyErr_SetString(PyExc_OverflowError,
11902 "repeated string is too long");
11903 return NULL;
11904 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011905 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011906
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011907 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011908 if (!u)
11909 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011910 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011911
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011912 if (PyUnicode_GET_LENGTH(str) == 1) {
11913 const int kind = PyUnicode_KIND(str);
11914 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11915 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011916 if (kind == PyUnicode_1BYTE_KIND)
11917 memset(to, (unsigned char)fill_char, len);
11918 else {
11919 for (n = 0; n < len; ++n)
11920 PyUnicode_WRITE(kind, to, n, fill_char);
11921 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011922 }
11923 else {
11924 /* number of characters copied this far */
11925 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011926 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011927 char *to = (char *) PyUnicode_DATA(u);
11928 Py_MEMCPY(to, PyUnicode_DATA(str),
11929 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011930 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011931 n = (done <= nchars-done) ? done : nchars-done;
11932 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011933 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011934 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011935 }
11936
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011937 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011938 return (PyObject*) u;
11939}
11940
Alexander Belopolsky40018472011-02-26 01:02:56 +000011941PyObject *
11942PyUnicode_Replace(PyObject *obj,
11943 PyObject *subobj,
11944 PyObject *replobj,
11945 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011946{
11947 PyObject *self;
11948 PyObject *str1;
11949 PyObject *str2;
11950 PyObject *result;
11951
11952 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011953 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011954 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011955 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011956 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011957 Py_DECREF(self);
11958 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011959 }
11960 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011961 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011962 Py_DECREF(self);
11963 Py_DECREF(str1);
11964 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011965 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011966 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011967 Py_DECREF(self);
11968 Py_DECREF(str1);
11969 Py_DECREF(str2);
11970 return result;
11971}
11972
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011973PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011974 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011975\n\
11976Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011977old replaced by new. If the optional argument count is\n\
11978given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011979
11980static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011981unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011982{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011983 PyObject *str1;
11984 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011985 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011986 PyObject *result;
11987
Martin v. Löwis18e16552006-02-15 17:27:45 +000011988 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011989 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011990 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011991 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011992 str1 = PyUnicode_FromObject(str1);
11993 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11994 return NULL;
11995 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011996 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011997 Py_DECREF(str1);
11998 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011999 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012000
12001 result = replace(self, str1, str2, maxcount);
12002
12003 Py_DECREF(str1);
12004 Py_DECREF(str2);
12005 return result;
12006}
12007
Alexander Belopolsky40018472011-02-26 01:02:56 +000012008static PyObject *
12009unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012010{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012011 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012012 Py_ssize_t isize;
12013 Py_ssize_t osize, squote, dquote, i, o;
12014 Py_UCS4 max, quote;
12015 int ikind, okind;
12016 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012017
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012018 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012019 return NULL;
12020
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012021 isize = PyUnicode_GET_LENGTH(unicode);
12022 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012023
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012024 /* Compute length of output, quote characters, and
12025 maximum character */
12026 osize = 2; /* quotes */
12027 max = 127;
12028 squote = dquote = 0;
12029 ikind = PyUnicode_KIND(unicode);
12030 for (i = 0; i < isize; i++) {
12031 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12032 switch (ch) {
12033 case '\'': squote++; osize++; break;
12034 case '"': dquote++; osize++; break;
12035 case '\\': case '\t': case '\r': case '\n':
12036 osize += 2; break;
12037 default:
12038 /* Fast-path ASCII */
12039 if (ch < ' ' || ch == 0x7f)
12040 osize += 4; /* \xHH */
12041 else if (ch < 0x7f)
12042 osize++;
12043 else if (Py_UNICODE_ISPRINTABLE(ch)) {
12044 osize++;
12045 max = ch > max ? ch : max;
12046 }
12047 else if (ch < 0x100)
12048 osize += 4; /* \xHH */
12049 else if (ch < 0x10000)
12050 osize += 6; /* \uHHHH */
12051 else
12052 osize += 10; /* \uHHHHHHHH */
12053 }
12054 }
12055
12056 quote = '\'';
12057 if (squote) {
12058 if (dquote)
12059 /* Both squote and dquote present. Use squote,
12060 and escape them */
12061 osize += squote;
12062 else
12063 quote = '"';
12064 }
12065
12066 repr = PyUnicode_New(osize, max);
12067 if (repr == NULL)
12068 return NULL;
12069 okind = PyUnicode_KIND(repr);
12070 odata = PyUnicode_DATA(repr);
12071
12072 PyUnicode_WRITE(okind, odata, 0, quote);
12073 PyUnicode_WRITE(okind, odata, osize-1, quote);
12074
12075 for (i = 0, o = 1; i < isize; i++) {
12076 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012077
12078 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012079 if ((ch == quote) || (ch == '\\')) {
12080 PyUnicode_WRITE(okind, odata, o++, '\\');
12081 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012082 continue;
12083 }
12084
Benjamin Peterson29060642009-01-31 22:14:21 +000012085 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012086 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012087 PyUnicode_WRITE(okind, odata, o++, '\\');
12088 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012089 }
12090 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012091 PyUnicode_WRITE(okind, odata, o++, '\\');
12092 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012093 }
12094 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012095 PyUnicode_WRITE(okind, odata, o++, '\\');
12096 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012097 }
12098
12099 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012100 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012101 PyUnicode_WRITE(okind, odata, o++, '\\');
12102 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012103 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12104 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012105 }
12106
Georg Brandl559e5d72008-06-11 18:37:52 +000012107 /* Copy ASCII characters as-is */
12108 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012109 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012110 }
12111
Benjamin Peterson29060642009-01-31 22:14:21 +000012112 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000012113 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012114 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000012115 (categories Z* and C* except ASCII space)
12116 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012117 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012118 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012119 if (ch <= 0xff) {
12120 PyUnicode_WRITE(okind, odata, o++, '\\');
12121 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012122 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12123 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012124 }
12125 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012126 else if (ch >= 0x10000) {
12127 PyUnicode_WRITE(okind, odata, o++, '\\');
12128 PyUnicode_WRITE(okind, odata, o++, 'U');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012129 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12130 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12131 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12132 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12133 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12134 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12135 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12136 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012137 }
12138 /* Map 16-bit characters to '\uxxxx' */
12139 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012140 PyUnicode_WRITE(okind, odata, o++, '\\');
12141 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012142 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12143 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12144 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12145 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012146 }
12147 }
12148 /* Copy characters as-is */
12149 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012150 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012151 }
12152 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012153 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012154 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012155 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012156 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012157}
12158
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012159PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012160 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012161\n\
12162Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012163such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012164arguments start and end are interpreted as in slice notation.\n\
12165\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012166Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012167
12168static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012169unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012170{
Jesus Ceaac451502011-04-20 17:09:23 +020012171 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012172 Py_ssize_t start;
12173 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012174 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012175
Jesus Ceaac451502011-04-20 17:09:23 +020012176 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12177 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012178 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012179
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012180 if (PyUnicode_READY(self) == -1)
12181 return NULL;
12182 if (PyUnicode_READY(substring) == -1)
12183 return NULL;
12184
Victor Stinner794d5672011-10-10 03:21:36 +020012185 result = any_find_slice(-1,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012186 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000012187 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000012188
12189 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012190
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012191 if (result == -2)
12192 return NULL;
12193
Christian Heimes217cfd12007-12-02 14:31:20 +000012194 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012195}
12196
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012197PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012198 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012199\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012200Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012201
12202static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012203unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012204{
Jesus Ceaac451502011-04-20 17:09:23 +020012205 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012206 Py_ssize_t start;
12207 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012208 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012209
Jesus Ceaac451502011-04-20 17:09:23 +020012210 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12211 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012212 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012213
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012214 if (PyUnicode_READY(self) == -1)
12215 return NULL;
12216 if (PyUnicode_READY(substring) == -1)
12217 return NULL;
12218
Victor Stinner794d5672011-10-10 03:21:36 +020012219 result = any_find_slice(-1,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012220 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000012221 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000012222
12223 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012224
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012225 if (result == -2)
12226 return NULL;
12227
Guido van Rossumd57fd912000-03-10 22:53:23 +000012228 if (result < 0) {
12229 PyErr_SetString(PyExc_ValueError, "substring not found");
12230 return NULL;
12231 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012232
Christian Heimes217cfd12007-12-02 14:31:20 +000012233 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012234}
12235
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012236PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012237 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012238\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012239Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012240done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012241
12242static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012243unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012244{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012245 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012246 Py_UCS4 fillchar = ' ';
12247
Victor Stinnere9a29352011-10-01 02:14:59 +020012248 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012249 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012250
Victor Stinnere9a29352011-10-01 02:14:59 +020012251 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012252 return NULL;
12253
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012254 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012255 Py_INCREF(self);
12256 return (PyObject*) self;
12257 }
12258
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012259 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012260}
12261
Alexander Belopolsky40018472011-02-26 01:02:56 +000012262PyObject *
12263PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012264{
12265 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012266
Guido van Rossumd57fd912000-03-10 22:53:23 +000012267 s = PyUnicode_FromObject(s);
12268 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012269 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012270 if (sep != NULL) {
12271 sep = PyUnicode_FromObject(sep);
12272 if (sep == NULL) {
12273 Py_DECREF(s);
12274 return NULL;
12275 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012276 }
12277
Victor Stinner9310abb2011-10-05 00:59:23 +020012278 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012279
12280 Py_DECREF(s);
12281 Py_XDECREF(sep);
12282 return result;
12283}
12284
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012285PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012286 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012287\n\
12288Return a list of the words in S, using sep as the\n\
12289delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012290splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012291whitespace string is a separator and empty strings are\n\
12292removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012293
12294static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012295unicode_split(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012296{
12297 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012298 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012299
Martin v. Löwis18e16552006-02-15 17:27:45 +000012300 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012301 return NULL;
12302
12303 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012304 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012305 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012306 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012307 else
Benjamin Peterson29060642009-01-31 22:14:21 +000012308 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012309}
12310
Thomas Wouters477c8d52006-05-27 19:21:47 +000012311PyObject *
12312PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12313{
12314 PyObject* str_obj;
12315 PyObject* sep_obj;
12316 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012317 int kind1, kind2, kind;
12318 void *buf1 = NULL, *buf2 = NULL;
12319 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012320
12321 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020012322 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012323 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012324 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012325 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012326 Py_DECREF(str_obj);
12327 return NULL;
12328 }
12329
Victor Stinner14f8f022011-10-05 20:58:25 +020012330 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012331 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012332 kind = Py_MAX(kind1, kind2);
12333 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012334 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012335 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012336 if (!buf1)
12337 goto onError;
12338 buf2 = PyUnicode_DATA(sep_obj);
12339 if (kind2 != kind)
12340 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12341 if (!buf2)
12342 goto onError;
12343 len1 = PyUnicode_GET_LENGTH(str_obj);
12344 len2 = PyUnicode_GET_LENGTH(sep_obj);
12345
Victor Stinner14f8f022011-10-05 20:58:25 +020012346 switch(PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012347 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012348 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12349 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12350 else
12351 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012352 break;
12353 case PyUnicode_2BYTE_KIND:
12354 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12355 break;
12356 case PyUnicode_4BYTE_KIND:
12357 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12358 break;
12359 default:
12360 assert(0);
12361 out = 0;
12362 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012363
12364 Py_DECREF(sep_obj);
12365 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012366 if (kind1 != kind)
12367 PyMem_Free(buf1);
12368 if (kind2 != kind)
12369 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012370
12371 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012372 onError:
12373 Py_DECREF(sep_obj);
12374 Py_DECREF(str_obj);
12375 if (kind1 != kind && buf1)
12376 PyMem_Free(buf1);
12377 if (kind2 != kind && buf2)
12378 PyMem_Free(buf2);
12379 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012380}
12381
12382
12383PyObject *
12384PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12385{
12386 PyObject* str_obj;
12387 PyObject* sep_obj;
12388 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012389 int kind1, kind2, kind;
12390 void *buf1 = NULL, *buf2 = NULL;
12391 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012392
12393 str_obj = PyUnicode_FromObject(str_in);
12394 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012395 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012396 sep_obj = PyUnicode_FromObject(sep_in);
12397 if (!sep_obj) {
12398 Py_DECREF(str_obj);
12399 return NULL;
12400 }
12401
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012402 kind1 = PyUnicode_KIND(str_in);
12403 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012404 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012405 buf1 = PyUnicode_DATA(str_in);
12406 if (kind1 != kind)
12407 buf1 = _PyUnicode_AsKind(str_in, kind);
12408 if (!buf1)
12409 goto onError;
12410 buf2 = PyUnicode_DATA(sep_obj);
12411 if (kind2 != kind)
12412 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12413 if (!buf2)
12414 goto onError;
12415 len1 = PyUnicode_GET_LENGTH(str_obj);
12416 len2 = PyUnicode_GET_LENGTH(sep_obj);
12417
12418 switch(PyUnicode_KIND(str_in)) {
12419 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012420 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12421 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12422 else
12423 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012424 break;
12425 case PyUnicode_2BYTE_KIND:
12426 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12427 break;
12428 case PyUnicode_4BYTE_KIND:
12429 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12430 break;
12431 default:
12432 assert(0);
12433 out = 0;
12434 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012435
12436 Py_DECREF(sep_obj);
12437 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012438 if (kind1 != kind)
12439 PyMem_Free(buf1);
12440 if (kind2 != kind)
12441 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012442
12443 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012444 onError:
12445 Py_DECREF(sep_obj);
12446 Py_DECREF(str_obj);
12447 if (kind1 != kind && buf1)
12448 PyMem_Free(buf1);
12449 if (kind2 != kind && buf2)
12450 PyMem_Free(buf2);
12451 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012452}
12453
12454PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012455 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012456\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012457Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012458the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012459found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012460
12461static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012462unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012463{
Victor Stinner9310abb2011-10-05 00:59:23 +020012464 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012465}
12466
12467PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012468 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012469\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012470Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012471the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012472separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012473
12474static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012475unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012476{
Victor Stinner9310abb2011-10-05 00:59:23 +020012477 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012478}
12479
Alexander Belopolsky40018472011-02-26 01:02:56 +000012480PyObject *
12481PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012482{
12483 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012484
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012485 s = PyUnicode_FromObject(s);
12486 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012487 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012488 if (sep != NULL) {
12489 sep = PyUnicode_FromObject(sep);
12490 if (sep == NULL) {
12491 Py_DECREF(s);
12492 return NULL;
12493 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012494 }
12495
Victor Stinner9310abb2011-10-05 00:59:23 +020012496 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012497
12498 Py_DECREF(s);
12499 Py_XDECREF(sep);
12500 return result;
12501}
12502
12503PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012504 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012505\n\
12506Return a list of the words in S, using sep as the\n\
12507delimiter string, starting at the end of the string and\n\
12508working to the front. If maxsplit is given, at most maxsplit\n\
12509splits are done. If sep is not specified, any whitespace string\n\
12510is a separator.");
12511
12512static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012513unicode_rsplit(PyObject *self, PyObject *args)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012514{
12515 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012516 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012517
Martin v. Löwis18e16552006-02-15 17:27:45 +000012518 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012519 return NULL;
12520
12521 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012522 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012523 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012524 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012525 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012526 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012527}
12528
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012529PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012530 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012531\n\
12532Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012533Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012534is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012535
12536static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012537unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012538{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012539 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012540 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012541
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012542 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12543 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012544 return NULL;
12545
Guido van Rossum86662912000-04-11 15:38:46 +000012546 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012547}
12548
12549static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012550PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012551{
Walter Dörwald346737f2007-05-31 10:44:43 +000012552 if (PyUnicode_CheckExact(self)) {
12553 Py_INCREF(self);
12554 return self;
12555 } else
12556 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020012557 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012558}
12559
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012560PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012561 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012562\n\
12563Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012564and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012565
12566static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012567unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012568{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012569 return fixup(self, fixswapcase);
12570}
12571
Georg Brandlceee0772007-11-27 23:48:05 +000012572PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012573 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012574\n\
12575Return a translation table usable for str.translate().\n\
12576If there is only one argument, it must be a dictionary mapping Unicode\n\
12577ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012578Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012579If there are two arguments, they must be strings of equal length, and\n\
12580in the resulting dictionary, each character in x will be mapped to the\n\
12581character at the same position in y. If there is a third argument, it\n\
12582must be a string, whose characters will be mapped to None in the result.");
12583
12584static PyObject*
12585unicode_maketrans(PyUnicodeObject *null, PyObject *args)
12586{
12587 PyObject *x, *y = NULL, *z = NULL;
12588 PyObject *new = NULL, *key, *value;
12589 Py_ssize_t i = 0;
12590 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012591
Georg Brandlceee0772007-11-27 23:48:05 +000012592 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12593 return NULL;
12594 new = PyDict_New();
12595 if (!new)
12596 return NULL;
12597 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012598 int x_kind, y_kind, z_kind;
12599 void *x_data, *y_data, *z_data;
12600
Georg Brandlceee0772007-11-27 23:48:05 +000012601 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012602 if (!PyUnicode_Check(x)) {
12603 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12604 "be a string if there is a second argument");
12605 goto err;
12606 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012607 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012608 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12609 "arguments must have equal length");
12610 goto err;
12611 }
12612 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012613 x_kind = PyUnicode_KIND(x);
12614 y_kind = PyUnicode_KIND(y);
12615 x_data = PyUnicode_DATA(x);
12616 y_data = PyUnicode_DATA(y);
12617 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12618 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12619 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012620 if (!key || !value)
12621 goto err;
12622 res = PyDict_SetItem(new, key, value);
12623 Py_DECREF(key);
12624 Py_DECREF(value);
12625 if (res < 0)
12626 goto err;
12627 }
12628 /* create entries for deleting chars in z */
12629 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012630 z_kind = PyUnicode_KIND(z);
12631 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012632 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012633 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012634 if (!key)
12635 goto err;
12636 res = PyDict_SetItem(new, key, Py_None);
12637 Py_DECREF(key);
12638 if (res < 0)
12639 goto err;
12640 }
12641 }
12642 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012643 int kind;
12644 void *data;
12645
Georg Brandlceee0772007-11-27 23:48:05 +000012646 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012647 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012648 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12649 "to maketrans it must be a dict");
12650 goto err;
12651 }
12652 /* copy entries into the new dict, converting string keys to int keys */
12653 while (PyDict_Next(x, &i, &key, &value)) {
12654 if (PyUnicode_Check(key)) {
12655 /* convert string keys to integer keys */
12656 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012657 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012658 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12659 "table must be of length 1");
12660 goto err;
12661 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012662 kind = PyUnicode_KIND(key);
12663 data = PyUnicode_DATA(key);
12664 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012665 if (!newkey)
12666 goto err;
12667 res = PyDict_SetItem(new, newkey, value);
12668 Py_DECREF(newkey);
12669 if (res < 0)
12670 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012671 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012672 /* just keep integer keys */
12673 if (PyDict_SetItem(new, key, value) < 0)
12674 goto err;
12675 } else {
12676 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12677 "be strings or integers");
12678 goto err;
12679 }
12680 }
12681 }
12682 return new;
12683 err:
12684 Py_DECREF(new);
12685 return NULL;
12686}
12687
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012688PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012689 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012690\n\
12691Return a copy of the string S, where all characters have been mapped\n\
12692through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012693Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012694Unmapped characters are left untouched. Characters mapped to None\n\
12695are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012696
12697static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012698unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012699{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012700 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012701}
12702
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012703PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012704 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012705\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012706Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012707
12708static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012709unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012710{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012711 return fixup(self, fixupper);
12712}
12713
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012714PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012715 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012716\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012717Pad a numeric string S with zeros on the left, to fill a field\n\
12718of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012719
12720static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012721unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012722{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012723 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012724 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012725 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012726 int kind;
12727 void *data;
12728 Py_UCS4 chr;
12729
12730 if (PyUnicode_READY(self) == -1)
12731 return NULL;
12732
Martin v. Löwis18e16552006-02-15 17:27:45 +000012733 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012734 return NULL;
12735
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012736 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012737 if (PyUnicode_CheckExact(self)) {
12738 Py_INCREF(self);
12739 return (PyObject*) self;
12740 }
12741 else
Victor Stinner2219e0a2011-10-01 01:16:59 +020012742 return PyUnicode_Copy((PyObject*)self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012743 }
12744
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012745 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012746
12747 u = pad(self, fill, 0, '0');
12748
Walter Dörwald068325e2002-04-15 13:36:47 +000012749 if (u == NULL)
12750 return NULL;
12751
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012752 kind = PyUnicode_KIND(u);
12753 data = PyUnicode_DATA(u);
12754 chr = PyUnicode_READ(kind, data, fill);
12755
12756 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012757 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012758 PyUnicode_WRITE(kind, data, 0, chr);
12759 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012760 }
12761
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012762 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012763 return (PyObject*) u;
12764}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012765
#if 0
/* Disabled by the surrounding #if 0: thin wrapper that returns the result
   of PyUnicode_TransformDecimalAndSpaceToASCII() for self. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
12773
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012774PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012775 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012776\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012777Return True if S starts with the specified prefix, False otherwise.\n\
12778With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012779With optional end, stop comparing S at that position.\n\
12780prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012781
12782static PyObject *
12783unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012784 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012785{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012786 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012787 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012788 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012789 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012790 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012791
Jesus Ceaac451502011-04-20 17:09:23 +020012792 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012793 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012794 if (PyTuple_Check(subobj)) {
12795 Py_ssize_t i;
12796 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12797 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012798 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012799 if (substring == NULL)
12800 return NULL;
12801 result = tailmatch(self, substring, start, end, -1);
12802 Py_DECREF(substring);
12803 if (result) {
12804 Py_RETURN_TRUE;
12805 }
12806 }
12807 /* nothing matched */
12808 Py_RETURN_FALSE;
12809 }
12810 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012811 if (substring == NULL) {
12812 if (PyErr_ExceptionMatches(PyExc_TypeError))
12813 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12814 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012815 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012816 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012817 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012818 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012819 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012820}
12821
12822
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012823PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012824 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012825\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012826Return True if S ends with the specified suffix, False otherwise.\n\
12827With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012828With optional end, stop comparing S at that position.\n\
12829suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012830
12831static PyObject *
12832unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012833 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012834{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012835 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012836 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012837 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012838 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012839 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012840
Jesus Ceaac451502011-04-20 17:09:23 +020012841 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012842 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012843 if (PyTuple_Check(subobj)) {
12844 Py_ssize_t i;
12845 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12846 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012847 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012848 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012849 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012850 result = tailmatch(self, substring, start, end, +1);
12851 Py_DECREF(substring);
12852 if (result) {
12853 Py_RETURN_TRUE;
12854 }
12855 }
12856 Py_RETURN_FALSE;
12857 }
12858 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012859 if (substring == NULL) {
12860 if (PyErr_ExceptionMatches(PyExc_TypeError))
12861 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12862 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012863 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012864 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012865 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012866 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012867 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012868}
12869
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012870#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012871
12872PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012873 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012874\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012875Return a formatted version of S, using substitutions from args and kwargs.\n\
12876The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012877
Eric Smith27bbca62010-11-04 17:06:58 +000012878PyDoc_STRVAR(format_map__doc__,
12879 "S.format_map(mapping) -> str\n\
12880\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012881Return a formatted version of S, using substitutions from mapping.\n\
12882The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012883
Eric Smith4a7d76d2008-05-30 18:10:19 +000012884static PyObject *
12885unicode__format__(PyObject* self, PyObject* args)
12886{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012887 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012888
12889 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12890 return NULL;
12891
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012892 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012893 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012894 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012895}
12896
Eric Smith8c663262007-08-25 02:26:07 +000012897PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012898 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012899\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012900Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012901
12902static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012903unicode__sizeof__(PyUnicodeObject *v)
12904{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012905 Py_ssize_t size;
12906
12907 /* If it's a compact object, account for base structure +
12908 character data. */
12909 if (PyUnicode_IS_COMPACT_ASCII(v))
12910 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12911 else if (PyUnicode_IS_COMPACT(v))
12912 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012913 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012914 else {
12915 /* If it is a two-block object, account for base object, and
12916 for character block if present. */
12917 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012918 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012919 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012920 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012921 }
12922 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012923 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012924 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012925 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012926 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012927 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012928
12929 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012930}
12931
12932PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012933 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012934
12935static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012936unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012937{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012938 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012939 if (!copy)
12940 return NULL;
12941 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012942}
12943
Guido van Rossumd57fd912000-03-10 22:53:23 +000012944static PyMethodDef unicode_methods[] = {
12945
12946 /* Order is according to common usage: often used methods should
12947 appear first, since lookup is done sequentially. */
12948
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012949 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012950 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12951 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012952 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012953 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12954 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12955 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12956 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12957 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12958 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12959 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012960 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012961 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12962 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12963 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012964 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012965 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12966 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12967 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012968 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012969 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012970 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012971 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012972 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12973 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12974 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12975 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12976 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12977 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12978 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12979 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12980 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12981 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12982 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12983 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12984 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12985 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012986 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012987 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012988 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012989 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012990 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012991 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012992 {"maketrans", (PyCFunction) unicode_maketrans,
12993 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012994 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012995#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012996 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012997#endif
12998
12999#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013000 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013001 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013002#endif
13003
Benjamin Peterson14339b62009-01-31 16:36:08 +000013004 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013005 {NULL, NULL}
13006};
13007
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013008static PyObject *
13009unicode_mod(PyObject *v, PyObject *w)
13010{
Brian Curtindfc80e32011-08-10 20:28:54 -050013011 if (!PyUnicode_Check(v))
13012 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013013 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013014}
13015
13016static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013017 0, /*nb_add*/
13018 0, /*nb_subtract*/
13019 0, /*nb_multiply*/
13020 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013021};
13022
Guido van Rossumd57fd912000-03-10 22:53:23 +000013023static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013024 (lenfunc) unicode_length, /* sq_length */
13025 PyUnicode_Concat, /* sq_concat */
13026 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13027 (ssizeargfunc) unicode_getitem, /* sq_item */
13028 0, /* sq_slice */
13029 0, /* sq_ass_item */
13030 0, /* sq_ass_slice */
13031 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013032};
13033
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013034static PyObject*
13035unicode_subscript(PyUnicodeObject* self, PyObject* item)
13036{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013037 if (PyUnicode_READY(self) == -1)
13038 return NULL;
13039
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013040 if (PyIndex_Check(item)) {
13041 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013042 if (i == -1 && PyErr_Occurred())
13043 return NULL;
13044 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013045 i += PyUnicode_GET_LENGTH(self);
Victor Stinner2fe5ced2011-10-02 00:25:40 +020013046 return unicode_getitem((PyObject*)self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013047 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013048 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013049 PyObject *result;
13050 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013051 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013052 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013053
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013054 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013055 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013056 return NULL;
13057 }
13058
13059 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013060 return PyUnicode_New(0, 0);
13061 } else if (start == 0 && step == 1 &&
13062 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000013063 PyUnicode_CheckExact(self)) {
13064 Py_INCREF(self);
13065 return (PyObject *)self;
13066 } else if (step == 1) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020013067 return PyUnicode_Substring((PyObject*)self,
13068 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013069 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013070 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013071 src_kind = PyUnicode_KIND(self);
13072 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013073 if (!PyUnicode_IS_ASCII(self)) {
13074 kind_limit = kind_maxchar_limit(src_kind);
13075 max_char = 0;
13076 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13077 ch = PyUnicode_READ(src_kind, src_data, cur);
13078 if (ch > max_char) {
13079 max_char = ch;
13080 if (max_char >= kind_limit)
13081 break;
13082 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013083 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013084 }
Victor Stinner55c99112011-10-13 01:17:06 +020013085 else
13086 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013087 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013088 if (result == NULL)
13089 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013090 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013091 dest_data = PyUnicode_DATA(result);
13092
13093 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013094 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13095 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013096 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013097 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013098 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013099 } else {
13100 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13101 return NULL;
13102 }
13103}
13104
13105static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013106 (lenfunc)unicode_length, /* mp_length */
13107 (binaryfunc)unicode_subscript, /* mp_subscript */
13108 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013109};
13110
Guido van Rossumd57fd912000-03-10 22:53:23 +000013111
Guido van Rossumd57fd912000-03-10 22:53:23 +000013112/* Helpers for PyUnicode_Format() */
13113
13114static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000013115getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013116{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013117 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013118 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013119 (*p_argidx)++;
13120 if (arglen < 0)
13121 return args;
13122 else
13123 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013124 }
13125 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013126 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013127 return NULL;
13128}
13129
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013130/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013131
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013132static PyObject *
13133formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013134{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013135 char *p;
13136 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013137 double x;
Tim Petersced69f82003-09-16 20:30:58 +000013138
Guido van Rossumd57fd912000-03-10 22:53:23 +000013139 x = PyFloat_AsDouble(v);
13140 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013141 return NULL;
13142
Guido van Rossumd57fd912000-03-10 22:53:23 +000013143 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013144 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013145
Eric Smith0923d1d2009-04-16 20:16:10 +000013146 p = PyOS_double_to_string(x, type, prec,
13147 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013148 if (p == NULL)
13149 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013150 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000013151 PyMem_Free(p);
13152 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013153}
13154
Tim Peters38fd5b62000-09-21 05:43:11 +000013155static PyObject*
13156formatlong(PyObject *val, int flags, int prec, int type)
13157{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013158 char *buf;
13159 int len;
13160 PyObject *str; /* temporary string object. */
13161 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013162
Benjamin Peterson14339b62009-01-31 16:36:08 +000013163 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
13164 if (!str)
13165 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013166 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013167 Py_DECREF(str);
13168 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013169}
13170
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013171static Py_UCS4
13172formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013173{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013174 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013175 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013176 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013177 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013178 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013179 goto onError;
13180 }
13181 else {
13182 /* Integer input truncated to a character */
13183 long x;
13184 x = PyLong_AsLong(v);
13185 if (x == -1 && PyErr_Occurred())
13186 goto onError;
13187
13188 if (x < 0 || x > 0x10ffff) {
13189 PyErr_SetString(PyExc_OverflowError,
13190 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013191 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013192 }
13193
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013194 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013195 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013196
Benjamin Peterson29060642009-01-31 22:14:21 +000013197 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013198 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013199 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013200 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013201}
13202
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013203static int
13204repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
13205{
13206 int r;
13207 assert(count > 0);
13208 assert(PyUnicode_Check(obj));
13209 if (count > 5) {
13210 PyObject *repeated = unicode_repeat((PyUnicodeObject *) obj, count);
13211 if (repeated == NULL)
13212 return -1;
13213 r = _PyAccu_Accumulate(acc, repeated);
13214 Py_DECREF(repeated);
13215 return r;
13216 }
13217 else {
13218 do {
13219 if (_PyAccu_Accumulate(acc, obj))
13220 return -1;
13221 } while (--count);
13222 return 0;
13223 }
13224}
13225
Alexander Belopolsky40018472011-02-26 01:02:56 +000013226PyObject *
13227PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013228{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013229 void *fmt;
13230 int fmtkind;
13231 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013232 int kind;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013233 int r;
13234 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013235 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013236 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013237 PyObject *temp = NULL;
13238 PyObject *second = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013239 PyUnicodeObject *uformat;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013240 _PyAccu acc;
13241 static PyObject *plus, *minus, *blank, *zero, *percent;
13242
13243 if (!plus && !(plus = get_latin1_char('+')))
13244 return NULL;
13245 if (!minus && !(minus = get_latin1_char('-')))
13246 return NULL;
13247 if (!blank && !(blank = get_latin1_char(' ')))
13248 return NULL;
13249 if (!zero && !(zero = get_latin1_char('0')))
13250 return NULL;
13251 if (!percent && !(percent = get_latin1_char('%')))
13252 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000013253
Guido van Rossumd57fd912000-03-10 22:53:23 +000013254 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013255 PyErr_BadInternalCall();
13256 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013257 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013258 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
13259 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013260 return NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013261 if (_PyAccu_Init(&acc))
13262 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013263 fmt = PyUnicode_DATA(uformat);
13264 fmtkind = PyUnicode_KIND(uformat);
13265 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13266 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013267
Guido van Rossumd57fd912000-03-10 22:53:23 +000013268 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013269 arglen = PyTuple_Size(args);
13270 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013271 }
13272 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013273 arglen = -1;
13274 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013275 }
Christian Heimes90aa7642007-12-19 02:45:37 +000013276 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000013277 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013278 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013279
13280 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013281 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013282 PyObject *nonfmt;
13283 Py_ssize_t nonfmtpos;
13284 nonfmtpos = fmtpos++;
13285 while (fmtcnt >= 0 &&
13286 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13287 fmtpos++;
13288 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013289 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013290 nonfmt = PyUnicode_Substring((PyObject *) uformat, nonfmtpos, fmtpos);
13291 if (nonfmt == NULL)
13292 goto onError;
13293 r = _PyAccu_Accumulate(&acc, nonfmt);
13294 Py_DECREF(nonfmt);
13295 if (r)
13296 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013297 }
13298 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013299 /* Got a format specifier */
13300 int flags = 0;
13301 Py_ssize_t width = -1;
13302 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013303 Py_UCS4 c = '\0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013304 Py_UCS4 fill, sign;
Benjamin Peterson29060642009-01-31 22:14:21 +000013305 int isnumok;
13306 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013307 void *pbuf = NULL;
13308 Py_ssize_t pindex, len;
13309 PyObject *signobj = NULL, *fillobj = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013310
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013311 fmtpos++;
13312 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
13313 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013314 Py_ssize_t keylen;
13315 PyObject *key;
13316 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013317
Benjamin Peterson29060642009-01-31 22:14:21 +000013318 if (dict == NULL) {
13319 PyErr_SetString(PyExc_TypeError,
13320 "format requires a mapping");
13321 goto onError;
13322 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013323 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013324 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013325 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013326 /* Skip over balanced parentheses */
13327 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013328 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013329 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013330 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013331 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013332 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013333 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013334 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013335 if (fmtcnt < 0 || pcount > 0) {
13336 PyErr_SetString(PyExc_ValueError,
13337 "incomplete format key");
13338 goto onError;
13339 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020013340 key = PyUnicode_Substring((PyObject*)uformat,
13341 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013342 if (key == NULL)
13343 goto onError;
13344 if (args_owned) {
13345 Py_DECREF(args);
13346 args_owned = 0;
13347 }
13348 args = PyObject_GetItem(dict, key);
13349 Py_DECREF(key);
13350 if (args == NULL) {
13351 goto onError;
13352 }
13353 args_owned = 1;
13354 arglen = -1;
13355 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013356 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013357 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013358 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013359 case '-': flags |= F_LJUST; continue;
13360 case '+': flags |= F_SIGN; continue;
13361 case ' ': flags |= F_BLANK; continue;
13362 case '#': flags |= F_ALT; continue;
13363 case '0': flags |= F_ZERO; continue;
13364 }
13365 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013366 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013367 if (c == '*') {
13368 v = getnextarg(args, arglen, &argidx);
13369 if (v == NULL)
13370 goto onError;
13371 if (!PyLong_Check(v)) {
13372 PyErr_SetString(PyExc_TypeError,
13373 "* wants int");
13374 goto onError;
13375 }
13376 width = PyLong_AsLong(v);
13377 if (width == -1 && PyErr_Occurred())
13378 goto onError;
13379 if (width < 0) {
13380 flags |= F_LJUST;
13381 width = -width;
13382 }
13383 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013384 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013385 }
13386 else if (c >= '0' && c <= '9') {
13387 width = c - '0';
13388 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013389 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013390 if (c < '0' || c > '9')
13391 break;
13392 if ((width*10) / 10 != width) {
13393 PyErr_SetString(PyExc_ValueError,
13394 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013395 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013396 }
13397 width = width*10 + (c - '0');
13398 }
13399 }
13400 if (c == '.') {
13401 prec = 0;
13402 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013403 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013404 if (c == '*') {
13405 v = getnextarg(args, arglen, &argidx);
13406 if (v == NULL)
13407 goto onError;
13408 if (!PyLong_Check(v)) {
13409 PyErr_SetString(PyExc_TypeError,
13410 "* wants int");
13411 goto onError;
13412 }
13413 prec = PyLong_AsLong(v);
13414 if (prec == -1 && PyErr_Occurred())
13415 goto onError;
13416 if (prec < 0)
13417 prec = 0;
13418 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013419 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013420 }
13421 else if (c >= '0' && c <= '9') {
13422 prec = c - '0';
13423 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013424 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013425 if (c < '0' || c > '9')
13426 break;
13427 if ((prec*10) / 10 != prec) {
13428 PyErr_SetString(PyExc_ValueError,
13429 "prec too big");
13430 goto onError;
13431 }
13432 prec = prec*10 + (c - '0');
13433 }
13434 }
13435 } /* prec */
13436 if (fmtcnt >= 0) {
13437 if (c == 'h' || c == 'l' || c == 'L') {
13438 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013439 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013440 }
13441 }
13442 if (fmtcnt < 0) {
13443 PyErr_SetString(PyExc_ValueError,
13444 "incomplete format");
13445 goto onError;
13446 }
13447 if (c != '%') {
13448 v = getnextarg(args, arglen, &argidx);
13449 if (v == NULL)
13450 goto onError;
13451 }
13452 sign = 0;
13453 fill = ' ';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013454 fillobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013455 switch (c) {
13456
13457 case '%':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013458 _PyAccu_Accumulate(&acc, percent);
13459 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013460
13461 case 's':
13462 case 'r':
13463 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000013464 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013465 temp = v;
13466 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013467 }
13468 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013469 if (c == 's')
13470 temp = PyObject_Str(v);
13471 else if (c == 'r')
13472 temp = PyObject_Repr(v);
13473 else
13474 temp = PyObject_ASCII(v);
13475 if (temp == NULL)
13476 goto onError;
13477 if (PyUnicode_Check(temp))
13478 /* nothing to do */;
13479 else {
13480 Py_DECREF(temp);
13481 PyErr_SetString(PyExc_TypeError,
13482 "%s argument has non-string str()");
13483 goto onError;
13484 }
13485 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013486 if (PyUnicode_READY(temp) == -1) {
13487 Py_CLEAR(temp);
13488 goto onError;
13489 }
13490 pbuf = PyUnicode_DATA(temp);
13491 kind = PyUnicode_KIND(temp);
13492 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013493 if (prec >= 0 && len > prec)
13494 len = prec;
13495 break;
13496
13497 case 'i':
13498 case 'd':
13499 case 'u':
13500 case 'o':
13501 case 'x':
13502 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000013503 isnumok = 0;
13504 if (PyNumber_Check(v)) {
13505 PyObject *iobj=NULL;
13506
13507 if (PyLong_Check(v)) {
13508 iobj = v;
13509 Py_INCREF(iobj);
13510 }
13511 else {
13512 iobj = PyNumber_Long(v);
13513 }
13514 if (iobj!=NULL) {
13515 if (PyLong_Check(iobj)) {
13516 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013517 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013518 Py_DECREF(iobj);
13519 if (!temp)
13520 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013521 if (PyUnicode_READY(temp) == -1) {
13522 Py_CLEAR(temp);
13523 goto onError;
13524 }
13525 pbuf = PyUnicode_DATA(temp);
13526 kind = PyUnicode_KIND(temp);
13527 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013528 sign = 1;
13529 }
13530 else {
13531 Py_DECREF(iobj);
13532 }
13533 }
13534 }
13535 if (!isnumok) {
13536 PyErr_Format(PyExc_TypeError,
13537 "%%%c format: a number is required, "
13538 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13539 goto onError;
13540 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013541 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013542 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013543 fillobj = zero;
13544 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013545 break;
13546
13547 case 'e':
13548 case 'E':
13549 case 'f':
13550 case 'F':
13551 case 'g':
13552 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013553 temp = formatfloat(v, flags, prec, c);
13554 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000013555 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013556 if (PyUnicode_READY(temp) == -1) {
13557 Py_CLEAR(temp);
13558 goto onError;
13559 }
13560 pbuf = PyUnicode_DATA(temp);
13561 kind = PyUnicode_KIND(temp);
13562 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013563 sign = 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013564 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013565 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013566 fillobj = zero;
13567 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013568 break;
13569
13570 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013571 {
13572 Py_UCS4 ch = formatchar(v);
13573 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013574 goto onError;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013575 temp = _PyUnicode_FromUCS4(&ch, 1);
13576 if (temp == NULL)
13577 goto onError;
13578 pbuf = PyUnicode_DATA(temp);
13579 kind = PyUnicode_KIND(temp);
13580 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013581 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013582 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013583
13584 default:
13585 PyErr_Format(PyExc_ValueError,
13586 "unsupported format character '%c' (0x%x) "
13587 "at index %zd",
13588 (31<=c && c<=126) ? (char)c : '?',
13589 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013590 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013591 goto onError;
13592 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013593 /* pbuf is initialized here. */
13594 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013595 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013596 if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
13597 signobj = minus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013598 len--;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013599 pindex++;
13600 }
13601 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
13602 signobj = plus;
13603 len--;
13604 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013605 }
13606 else if (flags & F_SIGN)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013607 signobj = plus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013608 else if (flags & F_BLANK)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013609 signobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013610 else
13611 sign = 0;
13612 }
13613 if (width < len)
13614 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013615 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013616 if (fill != ' ') {
13617 assert(signobj != NULL);
13618 if (_PyAccu_Accumulate(&acc, signobj))
13619 goto onError;
13620 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013621 if (width > len)
13622 width--;
13623 }
13624 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013625 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013626 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013627 if (fill != ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013628 second = get_latin1_char(
13629 PyUnicode_READ(kind, pbuf, pindex + 1));
13630 pindex += 2;
13631 if (second == NULL ||
13632 _PyAccu_Accumulate(&acc, zero) ||
13633 _PyAccu_Accumulate(&acc, second))
13634 goto onError;
13635 Py_CLEAR(second);
Benjamin Peterson29060642009-01-31 22:14:21 +000013636 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013637 width -= 2;
13638 if (width < 0)
13639 width = 0;
13640 len -= 2;
13641 }
13642 if (width > len && !(flags & F_LJUST)) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013643 assert(fillobj != NULL);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013644 if (repeat_accumulate(&acc, fillobj, width - len))
13645 goto onError;
13646 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013647 }
13648 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013649 if (sign) {
13650 assert(signobj != NULL);
13651 if (_PyAccu_Accumulate(&acc, signobj))
13652 goto onError;
13653 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013654 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013655 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13656 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013657 second = get_latin1_char(
13658 PyUnicode_READ(kind, pbuf, pindex + 1));
13659 pindex += 2;
13660 if (second == NULL ||
13661 _PyAccu_Accumulate(&acc, zero) ||
13662 _PyAccu_Accumulate(&acc, second))
13663 goto onError;
13664 Py_CLEAR(second);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013665 }
13666 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013667 /* Copy all characters, preserving len */
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013668 if (temp != NULL) {
13669 assert(pbuf == PyUnicode_DATA(temp));
13670 v = PyUnicode_Substring(temp, pindex, pindex + len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013671 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013672 else {
13673 const char *p = (const char *) pbuf;
13674 assert(pbuf != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013675 p += kind * pindex;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013676 v = PyUnicode_FromKindAndData(kind, p, len);
13677 }
13678 if (v == NULL)
13679 goto onError;
13680 r = _PyAccu_Accumulate(&acc, v);
13681 Py_DECREF(v);
13682 if (r)
13683 goto onError;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013684 if (width > len && repeat_accumulate(&acc, blank, width - len))
13685 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013686 if (dict && (argidx < arglen) && c != '%') {
13687 PyErr_SetString(PyExc_TypeError,
13688 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013689 goto onError;
13690 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013691 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013692 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013693 } /* until end */
13694 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013695 PyErr_SetString(PyExc_TypeError,
13696 "not all arguments converted during string formatting");
13697 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013698 }
13699
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013700 result = _PyAccu_Finish(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013701 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013702 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013703 }
13704 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013705 Py_XDECREF(temp);
13706 Py_XDECREF(second);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013707 return (PyObject *)result;
13708
Benjamin Peterson29060642009-01-31 22:14:21 +000013709 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013710 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013711 Py_XDECREF(temp);
13712 Py_XDECREF(second);
13713 _PyAccu_Destroy(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013714 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013715 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013716 }
13717 return NULL;
13718}
13719
Jeremy Hylton938ace62002-07-17 16:30:39 +000013720static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013721unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13722
Tim Peters6d6c1a32001-08-02 04:15:00 +000013723static PyObject *
13724unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13725{
Benjamin Peterson29060642009-01-31 22:14:21 +000013726 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013727 static char *kwlist[] = {"object", "encoding", "errors", 0};
13728 char *encoding = NULL;
13729 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013730
Benjamin Peterson14339b62009-01-31 16:36:08 +000013731 if (type != &PyUnicode_Type)
13732 return unicode_subtype_new(type, args, kwds);
13733 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013734 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013735 return NULL;
13736 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013737 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013738 if (encoding == NULL && errors == NULL)
13739 return PyObject_Str(x);
13740 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013741 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013742}
13743
Guido van Rossume023fe02001-08-30 03:12:59 +000013744static PyObject *
13745unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13746{
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013747 PyUnicodeObject *unicode, *self;
13748 Py_ssize_t length, char_size;
13749 int share_wstr, share_utf8;
13750 unsigned int kind;
13751 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013752
Benjamin Peterson14339b62009-01-31 16:36:08 +000013753 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013754
13755 unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
13756 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013757 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013758 assert(_PyUnicode_CHECK(unicode));
Victor Stinnere06e1452011-10-04 20:52:31 +020013759 if (PyUnicode_READY(unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013760 return NULL;
13761
13762 self = (PyUnicodeObject *) type->tp_alloc(type, 0);
13763 if (self == NULL) {
13764 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013765 return NULL;
13766 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013767 kind = PyUnicode_KIND(unicode);
13768 length = PyUnicode_GET_LENGTH(unicode);
13769
13770 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013771#ifdef Py_DEBUG
13772 _PyUnicode_HASH(self) = -1;
13773#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013774 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013775#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013776 _PyUnicode_STATE(self).interned = 0;
13777 _PyUnicode_STATE(self).kind = kind;
13778 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013779 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013780 _PyUnicode_STATE(self).ready = 1;
13781 _PyUnicode_WSTR(self) = NULL;
13782 _PyUnicode_UTF8_LENGTH(self) = 0;
13783 _PyUnicode_UTF8(self) = NULL;
13784 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013785 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013786
13787 share_utf8 = 0;
13788 share_wstr = 0;
13789 if (kind == PyUnicode_1BYTE_KIND) {
13790 char_size = 1;
13791 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13792 share_utf8 = 1;
13793 }
13794 else if (kind == PyUnicode_2BYTE_KIND) {
13795 char_size = 2;
13796 if (sizeof(wchar_t) == 2)
13797 share_wstr = 1;
13798 }
13799 else {
13800 assert(kind == PyUnicode_4BYTE_KIND);
13801 char_size = 4;
13802 if (sizeof(wchar_t) == 4)
13803 share_wstr = 1;
13804 }
13805
13806 /* Ensure we won't overflow the length. */
13807 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13808 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013809 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013810 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013811 data = PyObject_MALLOC((length + 1) * char_size);
13812 if (data == NULL) {
13813 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013814 goto onError;
13815 }
13816
Victor Stinnerc3c74152011-10-02 20:39:55 +020013817 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013818 if (share_utf8) {
13819 _PyUnicode_UTF8_LENGTH(self) = length;
13820 _PyUnicode_UTF8(self) = data;
13821 }
13822 if (share_wstr) {
13823 _PyUnicode_WSTR_LENGTH(self) = length;
13824 _PyUnicode_WSTR(self) = (wchar_t *)data;
13825 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013826
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013827 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013828 kind * (length + 1));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013829 Py_DECREF(unicode);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013830 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013831#ifdef Py_DEBUG
13832 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13833#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013834 return (PyObject *)self;
13835
13836onError:
13837 Py_DECREF(unicode);
13838 Py_DECREF(self);
13839 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013840}
13841
/* Docstring used as tp_doc of PyUnicode_Type (the "str" type). */
PyDoc_STRVAR(unicode_doc,
             "str(string[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013848
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013849static PyObject *unicode_iter(PyObject *seq);
13850
/* Type object for "str".  Subclassing is allowed (Py_TPFLAGS_BASETYPE);
   construction goes through unicode_new and iteration through
   unicode_iter. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                              /* tp_name */
    sizeof(PyUnicodeObject),            /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,        /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_reserved */
    unicode_repr,                       /* tp_repr */
    &unicode_as_number,                 /* tp_as_number */
    &unicode_as_sequence,               /* tp_as_sequence */
    &unicode_as_mapping,                /* tp_as_mapping */
    (hashfunc) unicode_hash,            /* tp_hash*/
    0,                                  /* tp_call*/
    (reprfunc) unicode_str,             /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    0,                                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,        /* tp_flags */
    unicode_doc,                        /* tp_doc */
    0,                                  /* tp_traverse */
    0,                                  /* tp_clear */
    PyUnicode_RichCompare,              /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    unicode_iter,                       /* tp_iter */
    0,                                  /* tp_iternext */
    unicode_methods,                    /* tp_methods */
    0,                                  /* tp_members */
    0,                                  /* tp_getset */
    &PyBaseObject_Type,                 /* tp_base */
    0,                                  /* tp_dict */
    0,                                  /* tp_descr_get */
    0,                                  /* tp_descr_set */
    0,                                  /* tp_dictoffset */
    0,                                  /* tp_init */
    0,                                  /* tp_alloc */
    unicode_new,                        /* tp_new */
    PyObject_Del,                       /* tp_free */
};
13894
13895/* Initialize the Unicode implementation */
13896
Victor Stinner3a50e702011-10-18 21:21:00 +020013897int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013898{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013899 int i;
13900
Thomas Wouters477c8d52006-05-27 19:21:47 +000013901 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013902 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013903 0x000A, /* LINE FEED */
13904 0x000D, /* CARRIAGE RETURN */
13905 0x001C, /* FILE SEPARATOR */
13906 0x001D, /* GROUP SEPARATOR */
13907 0x001E, /* RECORD SEPARATOR */
13908 0x0085, /* NEXT LINE */
13909 0x2028, /* LINE SEPARATOR */
13910 0x2029, /* PARAGRAPH SEPARATOR */
13911 };
13912
Fred Drakee4315f52000-05-09 19:53:39 +000013913 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013914 unicode_empty = PyUnicode_New(0, 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013915 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013916 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013917 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013918
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013919 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013920 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013921 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013922 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013923
13924 /* initialize the linebreak bloom filter */
13925 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013926 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013927 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013928
13929 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020013930
13931#ifdef HAVE_MBCS
13932 winver.dwOSVersionInfoSize = sizeof(winver);
13933 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
13934 PyErr_SetFromWindowsErr(0);
13935 return -1;
13936 }
13937#endif
13938 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013939}
13940
13941/* Finalize the Unicode implementation */
13942
/* This implementation performs no work and always returns 0. */
int PyUnicode_ClearFreeList(void)
{
    return 0;
}
13948
Guido van Rossumd57fd912000-03-10 22:53:23 +000013949void
Thomas Wouters78890102000-07-22 19:25:51 +000013950_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013951{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013952 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013953
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013954 Py_XDECREF(unicode_empty);
13955 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013956
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013957 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013958 if (unicode_latin1[i]) {
13959 Py_DECREF(unicode_latin1[i]);
13960 unicode_latin1[i] = NULL;
13961 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013962 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020013963 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000013964 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013965}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013966
/* Intern the string referenced by *p, in place.

   If an equal string is already stored in the `interned` dict, the caller's
   reference in *p is swapped for a new reference to that stored string.
   Otherwise the string is inserted into the dict (as both key and value) and
   marked SSTATE_INTERNED_MORTAL.

   The function returns silently, leaving the string un-interned, when *p is
   not an exact str instance, is already interned, cannot be made ready, or
   when the dict cannot be created or updated; exceptions raised by the dict
   operations are cleared.  In non-debug builds a NULL or non-str *p is also
   ignored; debug builds assert instead. */
void
PyUnicode_InternInPlace(PyObject **p)
{
    register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
    PyObject *t;
#ifdef Py_DEBUG
    assert(s != NULL);
    assert(_PyUnicode_CHECK(s));
#else
    if (s == NULL || !PyUnicode_Check(s))
        return;
#endif
    /* If it's a subclass, we don't really know what putting
       it in the interned dict might do. */
    if (!PyUnicode_CheckExact(s))
        return;
    if (PyUnicode_CHECK_INTERNED(s))
        return;
    if (_PyUnicode_READY_REPLACE(p)) {
        assert(0 && "_PyUnicode_READY_REPLACE fail in PyUnicode_InternInPlace");
        return;
    }
    /* Re-read *p: it is reloaded after the ready step rather than reusing
       the pointer captured at entry. */
    s = (PyUnicodeObject *)(*p);
    /* The interned dict is created lazily on first use. */
    if (interned == NULL) {
        interned = PyDict_New();
        if (interned == NULL) {
            PyErr_Clear(); /* Don't leave an exception */
            return;
        }
    }
    /* It might be that the GetItem call fails even
       though the key is present in the dictionary,
       namely when this happens during a stack overflow. */
    Py_ALLOW_RECURSION
    t = PyDict_GetItem(interned, (PyObject *)s);
    Py_END_ALLOW_RECURSION

    if (t) {
        /* An equal string is already interned: hand the caller a reference
           to it and drop the reference to the string passed in. */
        Py_INCREF(t);
        Py_DECREF(*p);
        *p = t;
        return;
    }

    /* The recursion_critical flag is raised around the insertion and
       lowered again on both the failure and the success path. */
    PyThreadState_GET()->recursion_critical = 1;
    if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
        PyErr_Clear();
        PyThreadState_GET()->recursion_critical = 0;
        return;
    }
    PyThreadState_GET()->recursion_critical = 0;
    /* The two references in interned are not counted by refcnt.
       The deallocator will take care of this */
    Py_REFCNT(s) -= 2;
    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
}
14023
14024void
14025PyUnicode_InternImmortal(PyObject **p)
14026{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014027 PyUnicodeObject *u = (PyUnicodeObject *)*p;
14028
Benjamin Peterson14339b62009-01-31 16:36:08 +000014029 PyUnicode_InternInPlace(p);
14030 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014031 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014032 Py_INCREF(*p);
14033 }
Walter Dörwald16807132007-05-25 13:52:07 +000014034}
14035
14036PyObject *
14037PyUnicode_InternFromString(const char *cp)
14038{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014039 PyObject *s = PyUnicode_FromString(cp);
14040 if (s == NULL)
14041 return NULL;
14042 PyUnicode_InternInPlace(&s);
14043 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014044}
14045
Alexander Belopolsky40018472011-02-26 01:02:56 +000014046void
14047_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014048{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014049 PyObject *keys;
14050 PyUnicodeObject *s;
14051 Py_ssize_t i, n;
14052 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014053
Benjamin Peterson14339b62009-01-31 16:36:08 +000014054 if (interned == NULL || !PyDict_Check(interned))
14055 return;
14056 keys = PyDict_Keys(interned);
14057 if (keys == NULL || !PyList_Check(keys)) {
14058 PyErr_Clear();
14059 return;
14060 }
Walter Dörwald16807132007-05-25 13:52:07 +000014061
Benjamin Peterson14339b62009-01-31 16:36:08 +000014062 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14063 detector, interned unicode strings are not forcibly deallocated;
14064 rather, we give them their stolen references back, and then clear
14065 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014066
Benjamin Peterson14339b62009-01-31 16:36:08 +000014067 n = PyList_GET_SIZE(keys);
14068 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014069 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014070 for (i = 0; i < n; i++) {
14071 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014072 if (PyUnicode_READY(s) == -1) {
14073 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014074 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014075 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014076 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014077 case SSTATE_NOT_INTERNED:
14078 /* XXX Shouldn't happen */
14079 break;
14080 case SSTATE_INTERNED_IMMORTAL:
14081 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014082 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014083 break;
14084 case SSTATE_INTERNED_MORTAL:
14085 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014086 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014087 break;
14088 default:
14089 Py_FatalError("Inconsistent interned string state.");
14090 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014091 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014092 }
14093 fprintf(stderr, "total size of all interned strings: "
14094 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14095 "mortal/immortal\n", mortal_size, immortal_size);
14096 Py_DECREF(keys);
14097 PyDict_Clear(interned);
14098 Py_DECREF(interned);
14099 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000014100}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014101
14102
14103/********************* Unicode Iterator **************************/
14104
/* State of an iterator over the characters of a str object. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;        /* Index of the next character to return */
    PyUnicodeObject *it_seq;    /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
14110
14111static void
14112unicodeiter_dealloc(unicodeiterobject *it)
14113{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014114 _PyObject_GC_UNTRACK(it);
14115 Py_XDECREF(it->it_seq);
14116 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014117}
14118
14119static int
14120unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14121{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014122 Py_VISIT(it->it_seq);
14123 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014124}
14125
14126static PyObject *
14127unicodeiter_next(unicodeiterobject *it)
14128{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014129 PyUnicodeObject *seq;
14130 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014131
Benjamin Peterson14339b62009-01-31 16:36:08 +000014132 assert(it != NULL);
14133 seq = it->it_seq;
14134 if (seq == NULL)
14135 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014136 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014137
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014138 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14139 int kind = PyUnicode_KIND(seq);
14140 void *data = PyUnicode_DATA(seq);
14141 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14142 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014143 if (item != NULL)
14144 ++it->it_index;
14145 return item;
14146 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014147
Benjamin Peterson14339b62009-01-31 16:36:08 +000014148 Py_DECREF(seq);
14149 it->it_seq = NULL;
14150 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014151}
14152
14153static PyObject *
14154unicodeiter_len(unicodeiterobject *it)
14155{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014156 Py_ssize_t len = 0;
14157 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014158 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014159 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014160}
14161
/* Docstring for the iterator's __length_hint__ method. */
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");

/* Method table for str iterators; __length_hint__ is the only entry. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {NULL,      NULL}       /* sentinel */
};
14169
/* Type object for "str_iterator", the iterator created by unicode_iter().
   Instances are GC-tracked (Py_TPFLAGS_HAVE_GC). */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",                     /* tp_name */
    sizeof(unicodeiterobject),          /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_reserved */
    0,                                  /* tp_repr */
    0,                                  /* tp_as_number */
    0,                                  /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    0,                                  /* tp_hash */
    0,                                  /* tp_call */
    0,                                  /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    0,                                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    PyObject_SelfIter,                  /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,                /* tp_methods */
    0,                                  /* tp_members */
};
14202
14203static PyObject *
14204unicode_iter(PyObject *seq)
14205{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014206 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014207
Benjamin Peterson14339b62009-01-31 16:36:08 +000014208 if (!PyUnicode_Check(seq)) {
14209 PyErr_BadInternalCall();
14210 return NULL;
14211 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014212 if (PyUnicode_READY(seq) == -1)
14213 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014214 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14215 if (it == NULL)
14216 return NULL;
14217 it->it_index = 0;
14218 Py_INCREF(seq);
14219 it->it_seq = (PyUnicodeObject *)seq;
14220 _PyObject_GC_TRACK(it);
14221 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014222}
14223
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014224#define UNIOP(x) Py_UNICODE_##x
14225#define UNIOP_t Py_UNICODE
14226#include "uniops.h"
14227#undef UNIOP
14228#undef UNIOP_t
14229#define UNIOP(x) Py_UCS4_##x
14230#define UNIOP_t Py_UCS4
14231#include "uniops.h"
14232#undef UNIOP
14233#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000014234
Victor Stinner71133ff2010-09-01 23:43:53 +000014235Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000014236PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000014237{
14238 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
Victor Stinner577db2c2011-10-11 22:12:48 +020014239 Py_UNICODE *u, *copy;
Victor Stinner71133ff2010-09-01 23:43:53 +000014240 Py_ssize_t size;
14241
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014242 if (!PyUnicode_Check(unicode)) {
14243 PyErr_BadArgument();
14244 return NULL;
14245 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014246 u = PyUnicode_AsUnicode(object);
14247 if (u == NULL)
14248 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014249 /* Ensure we won't overflow the size. */
14250 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
14251 PyErr_NoMemory();
14252 return NULL;
14253 }
14254 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
14255 size *= sizeof(Py_UNICODE);
14256 copy = PyMem_Malloc(size);
14257 if (copy == NULL) {
14258 PyErr_NoMemory();
14259 return NULL;
14260 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014261 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014262 return copy;
14263}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014264
/* A _string module, to export formatter_parser and formatter_field_name_split
   to the string.Formatter class implemented in Python. */
14267
14268static PyMethodDef _string_methods[] = {
14269 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14270 METH_O, PyDoc_STR("split the argument as a field name")},
14271 {"formatter_parser", (PyCFunction) formatter_parser,
14272 METH_O, PyDoc_STR("parse the argument as a format string")},
14273 {NULL, NULL}
14274};
14275
14276static struct PyModuleDef _string_module = {
14277 PyModuleDef_HEAD_INIT,
14278 "_string",
14279 PyDoc_STR("string helper module"),
14280 0,
14281 _string_methods,
14282 NULL,
14283 NULL,
14284 NULL,
14285 NULL
14286};
14287
14288PyMODINIT_FUNC
14289PyInit__string(void)
14290{
14291 return PyModule_Create(&_string_module);
14292}
14293
14294
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014295#ifdef __cplusplus
14296}
14297#endif