/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/

Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
Victor Stinnerce5faf62011-10-05 00:42:43 +020049#ifdef Py_DEBUG
50# define DONT_MAKE_RESULT_READY
51#endif
52
/* Upper bound on the number of entries in the Unicode object free list. */

#define PyUnicode_MAXFREELIST 1024

/* Size threshold for the free list "stay alive" optimization.

   Objects on the free list whose size is below this limit keep their
   allocated Unicode memory intact, which reduces malloc() overhead for
   small Unicode objects.

   In the worst case this leaves PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   A limit of 0 effectively turns the feature off.

   Note: this is an experimental feature!  If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Byte order selection; little endian unless WORDS_BIGENDIAN is set. */

#ifndef WORDS_BIGENDIAN
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
/* Type check used inside the assert()s of the macros below.  Debug
   builds run the consistency checker (without content checks); other
   builds only test the type. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200101
/* Field accessors.  The underscore-prefixed variants without assert()
   are plain member accesses and can be used as lvalues. */

/* utf8 member of the compact header. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* UTF-8 pointer of a ready string: for compact ASCII strings this is
   the memory directly following the PyASCIIObject header, otherwise
   the utf8 member. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))

/* utf8_length member of the compact header. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* UTF-8 length of a ready string: compact ASCII strings report the
   length member, all others the utf8_length member. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

/* wstr / wstr_length members. */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)

/* length, state and hash members of the base header. */
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)

/* Checked access to state.kind and length. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* data.any member of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200136
/* File-local replacement for PyUnicode_READY(): evaluates to 0 when the
   string is already ready, otherwise to the result of
   _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op)                            \
          ? 0                                           \
          : _PyUnicode_Ready((PyObject *)(op))))
Victor Stinner910337b2011-10-03 03:20:16 +0200143
/* Like PyUnicode_READY(), but takes the address of the object pointer:
   evaluates to 0 when *p_obj is already ready, otherwise to the result
   of _PyUnicode_ReadyReplace(p_obj).

   p_obj is parenthesized before being dereferenced so that pointer
   expressions such as "items + 1" are dereferenced as a whole. */
#define _PyUnicode_READY_REPLACE(p_obj)                 \
    (assert(_PyUnicode_CHECK(*(p_obj))),                \
     (PyUnicode_IS_READY(*(p_obj)) ?                    \
      0 : _PyUnicode_ReadyReplace((PyObject **)(p_obj))))
148
/* True if the utf8 pointer is the character data itself (no separate
   UTF-8 buffer).  Must not be used on compact ASCII strings. */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer is the character data itself (no separate
   wchar_t buffer).  Uses the macro argument throughout; it previously
   named a variable called "unicode" and only compiled where the caller
   happened to have one. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
156
/* True if the Unicode object owns a separately allocated UTF-8 memory
   block: not compact ASCII, utf8 set, and utf8 different from the
   character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op) &&                \
      _PyUnicode_UTF8(op) &&                            \
      _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object owns a separately allocated wstr memory
   block: wstr set and either the string is not ready or wstr differs
   from the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op)                               \
      && (!PyUnicode_IS_READY(op)                       \
          || _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
172
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   The main loop converts four characters per iteration; the trailing
   loop handles the remaining 0..3 characters.

   "to" is parenthesized before the cast so that an argument such as
   "buf + offset" is offset in units of buf's own type, and all locals
   carry a leading underscore to stay out of the caller's namespace. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + (_n & ~ (Py_ssize_t) 3);            \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200196
/* The Unicode string has been modified: invalidate the cached hash by
   storing -1 in the hash field. */
#define _PyUnicode_DIRTY(op)                            \
    do {                                                \
        _PyUnicode_HASH(op) = -1;                       \
    } while (0)
199
Walter Dörwald16807132007-05-25 13:52:07 +0000200/* This dictionary holds all interned unicode strings. Note that references
201 to strings in this dictionary are *not* counted in the string's ob_refcnt.
202 When the interned string reaches a refcnt of 0 the string deallocation
203 function will delete the reference from this dictionary.
204
205 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000206 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000207*/
208static PyObject *interned;
209
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000210/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200211static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000212
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200213/* List of static strings. */
214static _Py_Identifier *static_strings;
215
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000216/* Single character Unicode strings in the Latin-1 range are being
217 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200218static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000219
/* Fast detection of the most frequent whitespace characters: a 128
   entry table indexed by ASCII code, 1 for whitespace and 0 otherwise.

   Entries set to 1:
     0x0009 CHARACTER TABULATION    0x001C FILE SEPARATOR
     0x000A LINE FEED               0x001D GROUP SEPARATOR
     0x000B LINE TABULATION         0x001E RECORD SEPARATOR
     0x000C FORM FEED               0x001F UNIT SEPARATOR
     0x000D CARRIAGE RETURN         0x0020 SPACE
*/
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F */
    1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
250
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200251/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200252static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200253static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200254static void copy_characters(
255 PyObject *to, Py_ssize_t to_start,
256 PyObject *from, Py_ssize_t from_start,
257 Py_ssize_t how_many);
Victor Stinnerc729b8e2011-10-06 02:36:59 +0200258#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200259static int unicode_is_singleton(PyObject *unicode);
Victor Stinnerc729b8e2011-10-06 02:36:59 +0200260#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200261
Alexander Belopolsky40018472011-02-26 01:02:56 +0000262static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200263unicode_fromascii(const unsigned char *s, Py_ssize_t size);
264static PyObject *
265_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
266static PyObject *
267_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
268static PyObject *
269_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
270
271static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000272unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000273 PyObject **errorHandler,const char *encoding, const char *reason,
274 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
275 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
276
Alexander Belopolsky40018472011-02-26 01:02:56 +0000277static void
278raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300279 const char *encoding,
280 const Py_UNICODE *unicode, Py_ssize_t size,
281 Py_ssize_t startpos, Py_ssize_t endpos,
282 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000283
/* Same as _Py_ascii_whitespace, but for linebreaks: a 128 entry table
   indexed by ASCII code, 1 for a line break character and 0 otherwise.
   The table is only ever read (see BLOOM_LINEBREAK), so it is const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
311
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300312/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
313 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000314Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000315PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000316{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000317#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000318 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000319#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000320 /* This is actually an illegal character, so it should
321 not be passed to unichr. */
322 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000323#endif
324}
325
#ifdef Py_DEBUG
/* Debug-only sanity check of a Unicode object's internal state.

   op is the object to inspect.  Every invariant is enforced with
   assert(), so a violation aborts; otherwise the function returns 1,
   which lets callers wrap the call itself in assert().

   With check_content set, the characters are scanned as well to verify
   that the narrowest possible kind is used, and every object that is
   not a singleton must have an unset hash (-1). */
int
/* FIXME: use PyObject* type for op */
_PyUnicode_CheckConsistency(void *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii != 1 || ascii->state.compact != 1) {
        /* everything except compact ASCII has the compact header */
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;
        /* the kind whose character size matches wchar_t */
#if SIZEOF_WCHAR_T == 2
        const unsigned int wchar_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int wchar_kind = PyUnicode_4BYTE_KIND;
#endif

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: data follows the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert(compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind != PyUnicode_WCHAR_KIND) {
                /* legacy string that has been made ready */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert(compact->utf8 == data);
                    assert(compact->utf8_length == ascii->length);
                }
                else {
                    assert(compact->utf8 != data);
                }
            }
            else {
                /* legacy string that is not ready: wstr only */
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
            }
        }

        /* wstr aliases the data exactly when the kind matches wchar_t */
        if (kind != PyUnicode_WCHAR_KIND) {
            if (kind == wchar_kind) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        /* a missing buffer must come with a zero length */
        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }
    else {
        /* compact ASCII */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        void *data = PyUnicode_DATA(ascii);
        Py_UCS4 highest = 0;
        Py_ssize_t pos = 0;

        while (pos < ascii->length) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
            if (ch > highest)
                highest = ch;
            pos++;
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (ascii->state.ascii == 0) {
                assert(highest >= 128);
            }
            else {
                assert(highest < 128);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            assert(highest >= 0x100);
            break;
        default:
            assert(highest >= 0x10000);
            break;
        }
    }
    if (check_content && !unicode_is_singleton((PyObject*)ascii)) {
        assert(ascii->hash == -1);
    }
    return 1;
}
#endif
431
#if defined(HAVE_MBCS)
/* Windows version record; only compiled in when HAVE_MBCS is defined.
   NOTE(review): it is not referenced in this part of the file --
   confirm where it is filled in and read. */
static OSVERSIONINFOEX winver;
#endif
435
Thomas Wouters477c8d52006-05-27 19:21:47 +0000436/* --- Bloom Filters ----------------------------------------------------- */
437
438/* stuff to implement simple "bloom filters" for Unicode characters.
439 to keep things simple, we use a single bitmask, using the least 5
440 bits from each unicode characters as the bit index. */
441
442/* the linebreak mask is set up by Unicode_Init below */
443
Antoine Pitrouf068f942010-01-13 14:19:12 +0000444#if LONG_BIT >= 128
445#define BLOOM_WIDTH 128
446#elif LONG_BIT >= 64
447#define BLOOM_WIDTH 64
448#elif LONG_BIT >= 32
449#define BLOOM_WIDTH 32
450#else
451#error "LONG_BIT is smaller than 32"
452#endif
453
Thomas Wouters477c8d52006-05-27 19:21:47 +0000454#define BLOOM_MASK unsigned long
455
456static BLOOM_MASK bloom_linebreak;
457
/* BLOOM_ADD sets the bit for character ch in the lvalue mask;
   BLOOM evaluates to non-zero if the bit for ch is set in mask.
   The bit index is ch modulo BLOOM_WIDTH.  mask is parenthesized so
   that compound expressions (e.g. "a | b") are tested as a whole. */
#define BLOOM_ADD(mask, ch) \
    ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch) \
    ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Line break test: characters below 128 are looked up in
   ascii_linebreak; all others are first screened with the
   bloom_linebreak filter and then confirmed with
   Py_UNICODE_ISLINEBREAK(). */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000464
Alexander Belopolsky40018472011-02-26 01:02:56 +0000465Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200466make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000467{
468 /* calculate simple bloom-style bitmask for a given unicode string */
469
Antoine Pitrouf068f942010-01-13 14:19:12 +0000470 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000471 Py_ssize_t i;
472
473 mask = 0;
474 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200475 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000476
477 return mask;
478}
479
/* True if chr occurs in str: the bloom filter mask is consulted first
   and PyUnicode_FindChar() over the whole string only runs when the
   filter does not rule the character out. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr) &&                                                \
     (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000483
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200484/* Compilation of templated routines */
485
486#include "stringlib/asciilib.h"
487#include "stringlib/fastsearch.h"
488#include "stringlib/partition.h"
489#include "stringlib/split.h"
490#include "stringlib/count.h"
491#include "stringlib/find.h"
492#include "stringlib/find_max_char.h"
493#include "stringlib/localeutil.h"
494#include "stringlib/undef.h"
495
496#include "stringlib/ucs1lib.h"
497#include "stringlib/fastsearch.h"
498#include "stringlib/partition.h"
499#include "stringlib/split.h"
500#include "stringlib/count.h"
501#include "stringlib/find.h"
502#include "stringlib/find_max_char.h"
503#include "stringlib/localeutil.h"
504#include "stringlib/undef.h"
505
506#include "stringlib/ucs2lib.h"
507#include "stringlib/fastsearch.h"
508#include "stringlib/partition.h"
509#include "stringlib/split.h"
510#include "stringlib/count.h"
511#include "stringlib/find.h"
512#include "stringlib/find_max_char.h"
513#include "stringlib/localeutil.h"
514#include "stringlib/undef.h"
515
516#include "stringlib/ucs4lib.h"
517#include "stringlib/fastsearch.h"
518#include "stringlib/partition.h"
519#include "stringlib/split.h"
520#include "stringlib/count.h"
521#include "stringlib/find.h"
522#include "stringlib/find_max_char.h"
523#include "stringlib/localeutil.h"
524#include "stringlib/undef.h"
525
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200526#include "stringlib/unicodedefs.h"
527#include "stringlib/fastsearch.h"
528#include "stringlib/count.h"
529#include "stringlib/find.h"
530
Guido van Rossumd57fd912000-03-10 22:53:23 +0000531/* --- Unicode Object ----------------------------------------------------- */
532
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200533static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200534fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200535
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200536Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
537 Py_ssize_t size, Py_UCS4 ch,
538 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200539{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200540 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
541
542 switch (kind) {
543 case PyUnicode_1BYTE_KIND:
544 {
545 Py_UCS1 ch1 = (Py_UCS1) ch;
546 if (ch1 == ch)
547 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
548 else
549 return -1;
550 }
551 case PyUnicode_2BYTE_KIND:
552 {
553 Py_UCS2 ch2 = (Py_UCS2) ch;
554 if (ch2 == ch)
555 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
556 else
557 return -1;
558 }
559 case PyUnicode_4BYTE_KIND:
560 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
561 default:
562 assert(0);
563 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200564 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200565}
566
Victor Stinnerfe226c02011-10-03 03:52:20 +0200567static PyObject*
568resize_compact(PyObject *unicode, Py_ssize_t length)
569{
570 Py_ssize_t char_size;
571 Py_ssize_t struct_size;
572 Py_ssize_t new_size;
573 int share_wstr;
574
575 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200576 char_size = PyUnicode_KIND(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200577 if (PyUnicode_IS_COMPACT_ASCII(unicode))
578 struct_size = sizeof(PyASCIIObject);
579 else
580 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200581 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200582
583 _Py_DEC_REFTOTAL;
584 _Py_ForgetReference(unicode);
585
586 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
587 PyErr_NoMemory();
588 return NULL;
589 }
590 new_size = (struct_size + (length + 1) * char_size);
591
592 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
593 if (unicode == NULL) {
594 PyObject_Del(unicode);
595 PyErr_NoMemory();
596 return NULL;
597 }
598 _Py_NewReference(unicode);
599 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200600 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200601 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200602 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
603 _PyUnicode_WSTR_LENGTH(unicode) = length;
604 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200605 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
606 length, 0);
607 return unicode;
608}
609
Alexander Belopolsky40018472011-02-26 01:02:56 +0000610static int
Victor Stinner95663112011-10-04 01:03:50 +0200611resize_inplace(PyUnicodeObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000612{
Victor Stinner95663112011-10-04 01:03:50 +0200613 wchar_t *wstr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200614 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200615 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000616
Victor Stinner95663112011-10-04 01:03:50 +0200617 _PyUnicode_DIRTY(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200618
619 if (PyUnicode_IS_READY(unicode)) {
620 Py_ssize_t char_size;
621 Py_ssize_t new_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200622 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200623 void *data;
624
625 data = _PyUnicode_DATA_ANY(unicode);
626 assert(data != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200627 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200628 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
629 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinner95663112011-10-04 01:03:50 +0200630 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
631 {
632 PyObject_DEL(_PyUnicode_UTF8(unicode));
633 _PyUnicode_UTF8(unicode) = NULL;
634 _PyUnicode_UTF8_LENGTH(unicode) = 0;
635 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200636
637 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
638 PyErr_NoMemory();
639 return -1;
640 }
641 new_size = (length + 1) * char_size;
642
643 data = (PyObject *)PyObject_REALLOC(data, new_size);
644 if (data == NULL) {
645 PyErr_NoMemory();
646 return -1;
647 }
648 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200649 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200650 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200651 _PyUnicode_WSTR_LENGTH(unicode) = length;
652 }
653 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200654 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200655 _PyUnicode_UTF8_LENGTH(unicode) = length;
656 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200657 _PyUnicode_LENGTH(unicode) = length;
658 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200659 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200660 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200661 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200662 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200663 }
Victor Stinner95663112011-10-04 01:03:50 +0200664 assert(_PyUnicode_WSTR(unicode) != NULL);
665
666 /* check for integer overflow */
667 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
668 PyErr_NoMemory();
669 return -1;
670 }
671 wstr = _PyUnicode_WSTR(unicode);
672 wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
673 if (!wstr) {
674 PyErr_NoMemory();
675 return -1;
676 }
677 _PyUnicode_WSTR(unicode) = wstr;
678 _PyUnicode_WSTR(unicode)[length] = 0;
679 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200680 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000681 return 0;
682}
683
Victor Stinnerfe226c02011-10-03 03:52:20 +0200684static PyObject*
685resize_copy(PyObject *unicode, Py_ssize_t length)
686{
687 Py_ssize_t copy_length;
688 if (PyUnicode_IS_COMPACT(unicode)) {
689 PyObject *copy;
690 assert(PyUnicode_IS_READY(unicode));
691
692 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
693 if (copy == NULL)
694 return NULL;
695
696 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200697 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200698 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200699 }
700 else {
Victor Stinner2fd82272011-10-03 04:06:05 +0200701 PyUnicodeObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200702 assert(_PyUnicode_WSTR(unicode) != NULL);
703 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner2fd82272011-10-03 04:06:05 +0200704 w = _PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200705 if (w == NULL)
706 return NULL;
707 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
708 copy_length = Py_MIN(copy_length, length);
709 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
710 copy_length);
711 return (PyObject*)w;
712 }
713}
714
/* We allocate one more character (not merely one more byte) to make sure
   the string is U+0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
723
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200724#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200725static int unicode_old_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200726#endif
727
Alexander Belopolsky40018472011-02-26 01:02:56 +0000728static PyUnicodeObject *
729_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000730{
731 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200732 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000733
Thomas Wouters477c8d52006-05-27 19:21:47 +0000734 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000735 if (length == 0 && unicode_empty != NULL) {
736 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200737 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000738 }
739
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000740 /* Ensure we won't overflow the size. */
741 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
742 return (PyUnicodeObject *)PyErr_NoMemory();
743 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200744 if (length < 0) {
745 PyErr_SetString(PyExc_SystemError,
746 "Negative size passed to _PyUnicode_New");
747 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000748 }
749
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200750#ifdef Py_DEBUG
751 ++unicode_old_new_calls;
752#endif
753
754 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
755 if (unicode == NULL)
756 return NULL;
757 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
758 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
759 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000760 PyErr_NoMemory();
761 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000762 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200763
Jeremy Hyltond8082792003-09-16 19:41:39 +0000764 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000765 * the caller fails before initializing str -- unicode_resize()
766 * reads str[0], and the Keep-Alive optimization can keep memory
767 * allocated for str alive across a call to unicode_dealloc(unicode).
768 * We don't want unicode_resize to read uninitialized memory in
769 * that case.
770 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200771 _PyUnicode_WSTR(unicode)[0] = 0;
772 _PyUnicode_WSTR(unicode)[length] = 0;
773 _PyUnicode_WSTR_LENGTH(unicode) = length;
774 _PyUnicode_HASH(unicode) = -1;
775 _PyUnicode_STATE(unicode).interned = 0;
776 _PyUnicode_STATE(unicode).kind = 0;
777 _PyUnicode_STATE(unicode).compact = 0;
778 _PyUnicode_STATE(unicode).ready = 0;
779 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200780 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200781 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200782 _PyUnicode_UTF8(unicode) = NULL;
783 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000784 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000785
Benjamin Peterson29060642009-01-31 22:14:21 +0000786 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000787 /* XXX UNREF/NEWREF interface should be more symmetrical */
788 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000789 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000790 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000791 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000792}
793
Victor Stinnerf42dc442011-10-02 23:33:16 +0200794static const char*
795unicode_kind_name(PyObject *unicode)
796{
Victor Stinner42dfd712011-10-03 14:41:45 +0200797 /* don't check consistency: unicode_kind_name() is called from
798 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200799 if (!PyUnicode_IS_COMPACT(unicode))
800 {
801 if (!PyUnicode_IS_READY(unicode))
802 return "wstr";
803 switch(PyUnicode_KIND(unicode))
804 {
805 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200806 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200807 return "legacy ascii";
808 else
809 return "legacy latin1";
810 case PyUnicode_2BYTE_KIND:
811 return "legacy UCS2";
812 case PyUnicode_4BYTE_KIND:
813 return "legacy UCS4";
814 default:
815 return "<legacy invalid kind>";
816 }
817 }
818 assert(PyUnicode_IS_READY(unicode));
819 switch(PyUnicode_KIND(unicode))
820 {
821 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200822 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200823 return "ascii";
824 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200825 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200826 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200827 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200828 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200829 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200830 default:
831 return "<invalid compact kind>";
832 }
833}
834
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200835#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200836static int unicode_new_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200837
838/* Functions wrapping macros for use in debugger */
/* Debugger helper: callable form of the PyUnicode_UTF8() macro. */
char *_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
842
/* Debugger helper: callable form of the _PyUnicode_COMPACT_DATA() macro. */
void *_PyUnicode_compact_data(void *unicode)
{
    void *payload = _PyUnicode_COMPACT_DATA(unicode);
    return payload;
}
846void *_PyUnicode_data(void *unicode){
847 printf("obj %p\n", unicode);
848 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
849 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
850 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
851 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
852 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
853 return PyUnicode_DATA(unicode);
854}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200855
856void
857_PyUnicode_Dump(PyObject *op)
858{
859 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200860 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
861 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
862 void *data;
863 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
864 if (ascii->state.compact)
865 data = (compact + 1);
866 else
867 data = unicode->data.any;
868 if (ascii->wstr == data)
869 printf("shared ");
870 printf("wstr=%p", ascii->wstr);
Victor Stinnera3b334d2011-10-03 13:53:37 +0200871 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200872 printf(" (%zu), ", compact->wstr_length);
873 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
874 printf("shared ");
875 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200876 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200877 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200878}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200879#endif
880
881PyObject *
882PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
883{
884 PyObject *obj;
885 PyCompactUnicodeObject *unicode;
886 void *data;
887 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200888 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200889 Py_ssize_t char_size;
890 Py_ssize_t struct_size;
891
892 /* Optimization for empty strings */
893 if (size == 0 && unicode_empty != NULL) {
894 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200895 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200896 }
897
898#ifdef Py_DEBUG
899 ++unicode_new_new_calls;
900#endif
901
Victor Stinner9e9d6892011-10-04 01:02:02 +0200902 is_ascii = 0;
903 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200904 struct_size = sizeof(PyCompactUnicodeObject);
905 if (maxchar < 128) {
906 kind_state = PyUnicode_1BYTE_KIND;
907 char_size = 1;
908 is_ascii = 1;
909 struct_size = sizeof(PyASCIIObject);
910 }
911 else if (maxchar < 256) {
912 kind_state = PyUnicode_1BYTE_KIND;
913 char_size = 1;
914 }
915 else if (maxchar < 65536) {
916 kind_state = PyUnicode_2BYTE_KIND;
917 char_size = 2;
918 if (sizeof(wchar_t) == 2)
919 is_sharing = 1;
920 }
921 else {
922 kind_state = PyUnicode_4BYTE_KIND;
923 char_size = 4;
924 if (sizeof(wchar_t) == 4)
925 is_sharing = 1;
926 }
927
928 /* Ensure we won't overflow the size. */
929 if (size < 0) {
930 PyErr_SetString(PyExc_SystemError,
931 "Negative size passed to PyUnicode_New");
932 return NULL;
933 }
934 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
935 return PyErr_NoMemory();
936
937 /* Duplicated allocation code from _PyObject_New() instead of a call to
938 * PyObject_New() so we are able to allocate space for the object and
939 * it's data buffer.
940 */
941 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
942 if (obj == NULL)
943 return PyErr_NoMemory();
944 obj = PyObject_INIT(obj, &PyUnicode_Type);
945 if (obj == NULL)
946 return NULL;
947
948 unicode = (PyCompactUnicodeObject *)obj;
949 if (is_ascii)
950 data = ((PyASCIIObject*)obj) + 1;
951 else
952 data = unicode + 1;
953 _PyUnicode_LENGTH(unicode) = size;
954 _PyUnicode_HASH(unicode) = -1;
955 _PyUnicode_STATE(unicode).interned = 0;
956 _PyUnicode_STATE(unicode).kind = kind_state;
957 _PyUnicode_STATE(unicode).compact = 1;
958 _PyUnicode_STATE(unicode).ready = 1;
959 _PyUnicode_STATE(unicode).ascii = is_ascii;
960 if (is_ascii) {
961 ((char*)data)[size] = 0;
962 _PyUnicode_WSTR(unicode) = NULL;
963 }
964 else if (kind_state == PyUnicode_1BYTE_KIND) {
965 ((char*)data)[size] = 0;
966 _PyUnicode_WSTR(unicode) = NULL;
967 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200968 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200969 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200970 }
971 else {
972 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200973 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200974 if (kind_state == PyUnicode_2BYTE_KIND)
975 ((Py_UCS2*)data)[size] = 0;
976 else /* kind_state == PyUnicode_4BYTE_KIND */
977 ((Py_UCS4*)data)[size] = 0;
978 if (is_sharing) {
979 _PyUnicode_WSTR_LENGTH(unicode) = size;
980 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
981 }
982 else {
983 _PyUnicode_WSTR_LENGTH(unicode) = 0;
984 _PyUnicode_WSTR(unicode) = NULL;
985 }
986 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200987 assert(_PyUnicode_CheckConsistency(unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200988 return obj;
989}
990
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t range [begin, end) into the UCS4 buffer of
   `unicode`, joining each valid surrogate pair into one code point.  Lone
   surrogates are copied through unchanged.  The other conversions are
   implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        int is_pair;

        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));

        /* High surrogate immediately followed by a low surrogate? */
        is_pair = (src[0] >= 0xD800 && src[0] <= 0xDBFF
                   && (src+1) < end
                   && src[1] >= 0xDC00 && src[1] <= 0xDFFF);
        if (!is_pair) {
            *dst++ = *src;
            src++;
            continue;
        }
        *dst++ = (((src[0] & 0x3FF)<<10) | (src[1] & 0x3FF)) + 0x10000;
        src += 2;
    }
    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1029
Victor Stinnercd9950f2011-10-02 00:34:53 +02001030static int
1031_PyUnicode_Dirty(PyObject *unicode)
1032{
Victor Stinner910337b2011-10-03 03:20:16 +02001033 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +02001034 if (Py_REFCNT(unicode) != 1) {
Victor Stinner01698042011-10-04 00:04:26 +02001035 PyErr_SetString(PyExc_SystemError,
Victor Stinnercd9950f2011-10-02 00:34:53 +02001036 "Cannot modify a string having more than 1 reference");
1037 return -1;
1038 }
1039 _PyUnicode_DIRTY(unicode);
1040 return 0;
1041}
1042
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001043static int
1044_copy_characters(PyObject *to, Py_ssize_t to_start,
1045 PyObject *from, Py_ssize_t from_start,
1046 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001047{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001048 unsigned int from_kind, to_kind;
1049 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001050 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001051
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001052 assert(PyUnicode_Check(from));
1053 assert(PyUnicode_Check(to));
1054 assert(PyUnicode_IS_READY(from));
1055 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001056
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001057 assert(PyUnicode_GET_LENGTH(from) >= how_many);
1058 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1059 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001060
Victor Stinnerf5ca1a22011-09-28 23:54:59 +02001061 if (how_many == 0)
1062 return 0;
1063
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001064 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001065 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001066 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001067 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001068
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001069#ifdef Py_DEBUG
1070 if (!check_maxchar
1071 && (from_kind > to_kind
1072 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001073 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001074 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1075 Py_UCS4 ch;
1076 Py_ssize_t i;
1077 for (i=0; i < how_many; i++) {
1078 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1079 assert(ch <= to_maxchar);
1080 }
1081 }
1082#endif
1083 fast = (from_kind == to_kind);
1084 if (check_maxchar
1085 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1086 {
1087 /* deny latin1 => ascii */
1088 fast = 0;
1089 }
1090
1091 if (fast) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001092 Py_MEMCPY((char*)to_data + to_kind * to_start,
1093 (char*)from_data + from_kind * from_start,
1094 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001095 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001096 else if (from_kind == PyUnicode_1BYTE_KIND
1097 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001098 {
1099 _PyUnicode_CONVERT_BYTES(
1100 Py_UCS1, Py_UCS2,
1101 PyUnicode_1BYTE_DATA(from) + from_start,
1102 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1103 PyUnicode_2BYTE_DATA(to) + to_start
1104 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001105 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001106 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001107 && to_kind == PyUnicode_4BYTE_KIND)
1108 {
1109 _PyUnicode_CONVERT_BYTES(
1110 Py_UCS1, Py_UCS4,
1111 PyUnicode_1BYTE_DATA(from) + from_start,
1112 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1113 PyUnicode_4BYTE_DATA(to) + to_start
1114 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001115 }
1116 else if (from_kind == PyUnicode_2BYTE_KIND
1117 && to_kind == PyUnicode_4BYTE_KIND)
1118 {
1119 _PyUnicode_CONVERT_BYTES(
1120 Py_UCS2, Py_UCS4,
1121 PyUnicode_2BYTE_DATA(from) + from_start,
1122 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1123 PyUnicode_4BYTE_DATA(to) + to_start
1124 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001125 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001126 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001127 /* check if max_char(from substring) <= max_char(to) */
1128 if (from_kind > to_kind
1129 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001130 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001131 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001132 /* slow path to check for character overflow */
1133 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001134 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001135 Py_ssize_t i;
1136
Victor Stinner56c161a2011-10-06 02:47:11 +02001137#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001138 for (i=0; i < how_many; i++) {
1139 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001140 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001141 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1142 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001143#else
1144 if (!check_maxchar) {
1145 for (i=0; i < how_many; i++) {
1146 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1147 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1148 }
1149 }
1150 else {
1151 for (i=0; i < how_many; i++) {
1152 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1153 if (ch > to_maxchar)
1154 return 1;
1155 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1156 }
1157 }
1158#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001159 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001160 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001161 assert(0 && "inconsistent state");
1162 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001163 }
1164 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001165 return 0;
1166}
1167
1168static void
1169copy_characters(PyObject *to, Py_ssize_t to_start,
1170 PyObject *from, Py_ssize_t from_start,
1171 Py_ssize_t how_many)
1172{
1173 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1174}
1175
1176Py_ssize_t
1177PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1178 PyObject *from, Py_ssize_t from_start,
1179 Py_ssize_t how_many)
1180{
1181 int err;
1182
1183 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1184 PyErr_BadInternalCall();
1185 return -1;
1186 }
1187
1188 if (PyUnicode_READY(from))
1189 return -1;
1190 if (PyUnicode_READY(to))
1191 return -1;
1192
1193 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1194 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1195 PyErr_Format(PyExc_SystemError,
1196 "Cannot write %zi characters at %zi "
1197 "in a string of %zi characters",
1198 how_many, to_start, PyUnicode_GET_LENGTH(to));
1199 return -1;
1200 }
1201
1202 if (how_many == 0)
1203 return 0;
1204
1205 if (_PyUnicode_Dirty(to))
1206 return -1;
1207
1208 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1209 if (err) {
1210 PyErr_Format(PyExc_SystemError,
1211 "Cannot copy %s characters "
1212 "into a string of %s characters",
1213 unicode_kind_name(from),
1214 unicode_kind_name(to));
1215 return -1;
1216 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001217 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001218}
1219
Victor Stinner17222162011-09-28 22:15:37 +02001220/* Find the maximum code point and count the number of surrogate pairs so a
1221 correct string length can be computed before converting a string to UCS4.
1222 This function counts single surrogates as a character and not as a pair.
1223
1224 Return 0 on success, or -1 on error. */
1225static int
1226find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1227 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001228{
1229 const wchar_t *iter;
1230
Victor Stinnerc53be962011-10-02 21:33:54 +02001231 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001232 *num_surrogates = 0;
1233 *maxchar = 0;
1234
1235 for (iter = begin; iter < end; ) {
Victor Stinnerae864852011-10-05 14:02:44 +02001236 if (*iter > *maxchar) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001237 *maxchar = *iter;
Victor Stinnerae864852011-10-05 14:02:44 +02001238#if SIZEOF_WCHAR_T != 2
1239 if (*maxchar >= 0x10000)
1240 return 0;
1241#endif
1242 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001243#if SIZEOF_WCHAR_T == 2
1244 if (*iter >= 0xD800 && *iter <= 0xDBFF
1245 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1246 {
1247 Py_UCS4 surrogate_val;
1248 surrogate_val = (((iter[0] & 0x3FF)<<10)
1249 | (iter[1] & 0x3FF)) + 0x10000;
1250 ++(*num_surrogates);
1251 if (surrogate_val > *maxchar)
1252 *maxchar = surrogate_val;
1253 iter += 2;
1254 }
1255 else
1256 iter++;
1257#else
1258 iter++;
1259#endif
1260 }
1261 return 0;
1262}
1263
1264#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001265static int unicode_ready_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001266#endif
1267
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001268static int
1269unicode_ready(PyObject **p_obj, int replace)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001270{
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001271 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001272 wchar_t *end;
1273 Py_UCS4 maxchar = 0;
1274 Py_ssize_t num_surrogates;
1275#if SIZEOF_WCHAR_T == 2
1276 Py_ssize_t length_wo_surrogates;
1277#endif
1278
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001279 assert(p_obj != NULL);
1280 unicode = (PyUnicodeObject *)*p_obj;
1281
Georg Brandl7597add2011-10-05 16:36:47 +02001282 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001283 strings were created using _PyObject_New() and where no canonical
1284 representation (the str field) has been set yet aka strings
1285 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001286 assert(_PyUnicode_CHECK(unicode));
1287 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001288 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001289 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001290 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001291 /* Actually, it should neither be interned nor be anything else: */
1292 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001293
1294#ifdef Py_DEBUG
1295 ++unicode_ready_calls;
1296#endif
1297
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001298#ifdef Py_DEBUG
1299 assert(!replace || Py_REFCNT(unicode) == 1);
1300#else
1301 if (replace && Py_REFCNT(unicode) != 1)
1302 replace = 0;
1303#endif
1304 if (replace) {
1305 Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode);
1306 wchar_t *wstr = _PyUnicode_WSTR(unicode);
1307 /* Optimization for empty strings */
1308 if (len == 0) {
1309 Py_INCREF(unicode_empty);
1310 Py_DECREF(*p_obj);
1311 *p_obj = unicode_empty;
1312 return 0;
1313 }
1314 if (len == 1 && wstr[0] < 256) {
1315 PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]);
1316 if (latin1_char == NULL)
1317 return -1;
1318 Py_DECREF(*p_obj);
1319 *p_obj = latin1_char;
1320 return 0;
1321 }
1322 }
1323
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001324 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001325 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001326 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001327 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001328
1329 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001330 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1331 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001332 PyErr_NoMemory();
1333 return -1;
1334 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001335 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001336 _PyUnicode_WSTR(unicode), end,
1337 PyUnicode_1BYTE_DATA(unicode));
1338 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1339 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1340 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1341 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001342 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001343 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001344 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001345 }
1346 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001347 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001348 _PyUnicode_UTF8(unicode) = NULL;
1349 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001350 }
1351 PyObject_FREE(_PyUnicode_WSTR(unicode));
1352 _PyUnicode_WSTR(unicode) = NULL;
1353 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1354 }
1355 /* In this case we might have to convert down from 4-byte native
1356 wchar_t to 2-byte unicode. */
1357 else if (maxchar < 65536) {
1358 assert(num_surrogates == 0 &&
1359 "FindMaxCharAndNumSurrogatePairs() messed up");
1360
Victor Stinner506f5922011-09-28 22:34:18 +02001361#if SIZEOF_WCHAR_T == 2
1362 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001363 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001364 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1365 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1366 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001367 _PyUnicode_UTF8(unicode) = NULL;
1368 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001369#else
1370 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001371 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001372 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001373 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001374 PyErr_NoMemory();
1375 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001376 }
Victor Stinner506f5922011-09-28 22:34:18 +02001377 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1378 _PyUnicode_WSTR(unicode), end,
1379 PyUnicode_2BYTE_DATA(unicode));
1380 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1381 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1382 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001383 _PyUnicode_UTF8(unicode) = NULL;
1384 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001385 PyObject_FREE(_PyUnicode_WSTR(unicode));
1386 _PyUnicode_WSTR(unicode) = NULL;
1387 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1388#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001389 }
1390 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1391 else {
1392#if SIZEOF_WCHAR_T == 2
1393 /* in case the native representation is 2-bytes, we need to allocate a
1394 new normalized 4-byte version. */
1395 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001396 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1397 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001398 PyErr_NoMemory();
1399 return -1;
1400 }
1401 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1402 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001403 _PyUnicode_UTF8(unicode) = NULL;
1404 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001405 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1406 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001407 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001408 PyObject_FREE(_PyUnicode_WSTR(unicode));
1409 _PyUnicode_WSTR(unicode) = NULL;
1410 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1411#else
1412 assert(num_surrogates == 0);
1413
Victor Stinnerc3c74152011-10-02 20:39:55 +02001414 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001415 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001416 _PyUnicode_UTF8(unicode) = NULL;
1417 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001418 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1419#endif
1420 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1421 }
1422 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001423 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001424 return 0;
1425}
1426
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001427int
1428_PyUnicode_ReadyReplace(PyObject **op)
1429{
1430 return unicode_ready(op, 1);
1431}
1432
1433int
1434_PyUnicode_Ready(PyObject *op)
1435{
1436 return unicode_ready(&op, 0);
1437}
1438
Alexander Belopolsky40018472011-02-26 01:02:56 +00001439static void
1440unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001441{
Walter Dörwald16807132007-05-25 13:52:07 +00001442 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001443 case SSTATE_NOT_INTERNED:
1444 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001445
Benjamin Peterson29060642009-01-31 22:14:21 +00001446 case SSTATE_INTERNED_MORTAL:
1447 /* revive dead object temporarily for DelItem */
1448 Py_REFCNT(unicode) = 3;
1449 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
1450 Py_FatalError(
1451 "deletion of interned string failed");
1452 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001453
Benjamin Peterson29060642009-01-31 22:14:21 +00001454 case SSTATE_INTERNED_IMMORTAL:
1455 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001456
Benjamin Peterson29060642009-01-31 22:14:21 +00001457 default:
1458 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001459 }
1460
Victor Stinner03490912011-10-03 23:45:12 +02001461 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001462 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001463 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001464 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001465
1466 if (PyUnicode_IS_COMPACT(unicode)) {
1467 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001468 }
1469 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001470 if (_PyUnicode_DATA_ANY(unicode))
1471 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Benjamin Peterson29060642009-01-31 22:14:21 +00001472 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001473 }
1474}
1475
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if `unicode` is the shared empty string or one
   of the cached single-character strings in unicode_latin1[], else 0. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a one-character string that is not of WCHAR kind can be a
       cached Latin-1 character. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    if (ch >= 256)
        return 0;
    return (unicode_latin1[ch] == unicode) ? 1 : 0;
}
#endif
1492
Alexander Belopolsky40018472011-02-26 01:02:56 +00001493static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001494unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001495{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001496 if (Py_REFCNT(unicode) != 1)
1497 return 0;
1498 if (PyUnicode_CHECK_INTERNED(unicode))
1499 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001500#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001501 /* singleton refcount is greater than 1 */
1502 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001503#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001504 return 1;
1505}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001506
Victor Stinnerfe226c02011-10-03 03:52:20 +02001507static int
1508unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1509{
1510 PyObject *unicode;
1511 Py_ssize_t old_length;
1512
1513 assert(p_unicode != NULL);
1514 unicode = *p_unicode;
1515
1516 assert(unicode != NULL);
1517 assert(PyUnicode_Check(unicode));
1518 assert(0 <= length);
1519
Victor Stinner910337b2011-10-03 03:20:16 +02001520 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001521 old_length = PyUnicode_WSTR_LENGTH(unicode);
1522 else
1523 old_length = PyUnicode_GET_LENGTH(unicode);
1524 if (old_length == length)
1525 return 0;
1526
Victor Stinnerfe226c02011-10-03 03:52:20 +02001527 if (!unicode_resizable(unicode)) {
1528 PyObject *copy = resize_copy(unicode, length);
1529 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001530 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001531 Py_DECREF(*p_unicode);
1532 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001533 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001534 }
1535
Victor Stinnerfe226c02011-10-03 03:52:20 +02001536 if (PyUnicode_IS_COMPACT(unicode)) {
1537 *p_unicode = resize_compact(unicode, length);
1538 if (*p_unicode == NULL)
1539 return -1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001540 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001541 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001542 }
1543 return resize_inplace((PyUnicodeObject*)unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001544}
1545
Alexander Belopolsky40018472011-02-26 01:02:56 +00001546int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001547PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001548{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001549 PyObject *unicode;
1550 if (p_unicode == NULL) {
1551 PyErr_BadInternalCall();
1552 return -1;
1553 }
1554 unicode = *p_unicode;
1555 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0
1556 || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND)
1557 {
1558 PyErr_BadInternalCall();
1559 return -1;
1560 }
1561 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001562}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001563
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001564static PyObject*
1565get_latin1_char(unsigned char ch)
1566{
Victor Stinnera464fc12011-10-02 20:39:30 +02001567 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001568 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001569 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001570 if (!unicode)
1571 return NULL;
1572 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001573 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001574 unicode_latin1[ch] = unicode;
1575 }
1576 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001577 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001578}
1579
Alexander Belopolsky40018472011-02-26 01:02:56 +00001580PyObject *
1581PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001582{
1583 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001584 Py_UCS4 maxchar = 0;
1585 Py_ssize_t num_surrogates;
1586
1587 if (u == NULL)
1588 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001589
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001590 /* If the Unicode data is known at construction time, we can apply
1591 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001592
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001593 /* Optimization for empty strings */
1594 if (size == 0 && unicode_empty != NULL) {
1595 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001596 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001597 }
Tim Petersced69f82003-09-16 20:30:58 +00001598
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001599 /* Single character Unicode objects in the Latin-1 range are
1600 shared when using this constructor */
1601 if (size == 1 && *u < 256)
1602 return get_latin1_char((unsigned char)*u);
1603
1604 /* If not empty and not single character, copy the Unicode data
1605 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001606 if (find_maxchar_surrogates(u, u + size,
1607 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001608 return NULL;
1609
1610 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1611 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001612 if (!unicode)
1613 return NULL;
1614
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001615 switch (PyUnicode_KIND(unicode)) {
1616 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001617 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001618 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1619 break;
1620 case PyUnicode_2BYTE_KIND:
1621#if Py_UNICODE_SIZE == 2
1622 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1623#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001624 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001625 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1626#endif
1627 break;
1628 case PyUnicode_4BYTE_KIND:
1629#if SIZEOF_WCHAR_T == 2
1630 /* This is the only case which has to process surrogates, thus
1631 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001632 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001633#else
1634 assert(num_surrogates == 0);
1635 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1636#endif
1637 break;
1638 default:
1639 assert(0 && "Impossible state");
1640 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001641
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001642 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001643 return (PyObject *)unicode;
1644}
1645
Alexander Belopolsky40018472011-02-26 01:02:56 +00001646PyObject *
1647PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001648{
1649 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001650
Benjamin Peterson14339b62009-01-31 16:36:08 +00001651 if (size < 0) {
1652 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001653 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001654 return NULL;
1655 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001656
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001657 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001658 some optimizations which share commonly used objects.
1659 Also, this means the input must be UTF-8, so fall back to the
1660 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001661 if (u != NULL) {
1662
Benjamin Peterson29060642009-01-31 22:14:21 +00001663 /* Optimization for empty strings */
1664 if (size == 0 && unicode_empty != NULL) {
1665 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001666 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001667 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001668
1669 /* Single characters are shared when using this constructor.
1670 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001671 if (size == 1 && Py_CHARMASK(*u) < 128)
1672 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001673
1674 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001675 }
1676
Walter Dörwald55507312007-05-18 13:12:10 +00001677 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001678 if (!unicode)
1679 return NULL;
1680
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001681 return (PyObject *)unicode;
1682}
1683
Alexander Belopolsky40018472011-02-26 01:02:56 +00001684PyObject *
1685PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001686{
1687 size_t size = strlen(u);
1688 if (size > PY_SSIZE_T_MAX) {
1689 PyErr_SetString(PyExc_OverflowError, "input too long");
1690 return NULL;
1691 }
1692
1693 return PyUnicode_FromStringAndSize(u, size);
1694}
1695
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001696PyObject *
1697_PyUnicode_FromId(_Py_Identifier *id)
1698{
1699 if (!id->object) {
1700 id->object = PyUnicode_FromString(id->string);
1701 if (!id->object)
1702 return NULL;
1703 PyUnicode_InternInPlace(&id->object);
1704 assert(!id->next);
1705 id->next = static_strings;
1706 static_strings = id;
1707 }
1708 Py_INCREF(id->object);
1709 return id->object;
1710}
1711
1712void
1713_PyUnicode_ClearStaticStrings()
1714{
1715 _Py_Identifier *i;
1716 for (i = static_strings; i; i = i->next) {
1717 Py_DECREF(i->object);
1718 i->object = NULL;
1719 i->next = NULL;
1720 }
1721}
1722
Victor Stinnere57b1c02011-09-28 22:20:48 +02001723static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001724unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001725{
Victor Stinner0617b6e2011-10-05 23:26:01 +02001726 PyObject *res;
1727#ifdef Py_DEBUG
1728 const unsigned char *p;
1729 const unsigned char *end = s + size;
1730 for (p=s; p < end; p++) {
1731 assert(*p < 128);
1732 }
1733#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001734 if (size == 1)
1735 return get_latin1_char(s[0]);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001736 res = PyUnicode_New(size, 127);
Victor Stinner702c7342011-10-05 13:50:52 +02001737 if (!res)
1738 return NULL;
Victor Stinner0617b6e2011-10-05 23:26:01 +02001739 memcpy(PyUnicode_1BYTE_DATA(res), s, size);
Victor Stinner702c7342011-10-05 13:50:52 +02001740 return res;
1741}
1742
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001743static Py_UCS4
1744kind_maxchar_limit(unsigned int kind)
1745{
1746 switch(kind) {
1747 case PyUnicode_1BYTE_KIND:
1748 return 0x80;
1749 case PyUnicode_2BYTE_KIND:
1750 return 0x100;
1751 case PyUnicode_4BYTE_KIND:
1752 return 0x10000;
1753 default:
1754 assert(0 && "invalid kind");
1755 return 0x10ffff;
1756 }
1757}
1758
Victor Stinner702c7342011-10-05 13:50:52 +02001759static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001760_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001761{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001762 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001763 unsigned char max_char = 127;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001764
1765 assert(size >= 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001766 if (size == 1)
1767 return get_latin1_char(u[0]);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001768 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001769 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001770 if (!res)
1771 return NULL;
1772 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001773 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001774 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001775}
1776
Victor Stinnere57b1c02011-09-28 22:20:48 +02001777static PyObject*
1778_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001779{
1780 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001781 Py_UCS2 max_char = 0;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001782
1783 assert(size >= 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001784 if (size == 1 && u[0] < 256)
Victor Stinner4e101002011-10-11 23:27:52 +02001785 return get_latin1_char((unsigned char)u[0]);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001786 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001787 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001788 if (!res)
1789 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001790 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001791 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001792 else {
1793 _PyUnicode_CONVERT_BYTES(
1794 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1795 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001796 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001797 return res;
1798}
1799
Victor Stinnere57b1c02011-09-28 22:20:48 +02001800static PyObject*
1801_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001802{
1803 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001804 Py_UCS4 max_char = 0;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001805
1806 assert(size >= 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001807 if (size == 1 && u[0] < 256)
1808 return get_latin1_char(u[0]);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001809 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001810 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001811 if (!res)
1812 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001813 if (max_char < 256)
1814 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1815 PyUnicode_1BYTE_DATA(res));
1816 else if (max_char < 0x10000)
1817 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1818 PyUnicode_2BYTE_DATA(res));
1819 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001820 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001821 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001822 return res;
1823}
1824
1825PyObject*
1826PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1827{
1828 switch(kind) {
1829 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001830 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001831 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001832 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001833 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001834 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001835 default:
1836 assert(0 && "invalid kind");
1837 PyErr_SetString(PyExc_SystemError, "invalid kind");
1838 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001839 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001840}
1841
Victor Stinner25a4b292011-10-06 12:31:55 +02001842/* Ensure that a string uses the most efficient storage, if it is not the
1843 case: create a new string with of the right kind. Write NULL into *p_unicode
1844 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001845static void
Victor Stinner25a4b292011-10-06 12:31:55 +02001846unicode_adjust_maxchar(PyObject **p_unicode)
1847{
1848 PyObject *unicode, *copy;
1849 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001850 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02001851 unsigned int kind;
1852
1853 assert(p_unicode != NULL);
1854 unicode = *p_unicode;
1855 assert(PyUnicode_IS_READY(unicode));
1856 if (PyUnicode_IS_ASCII(unicode))
1857 return;
1858
1859 len = PyUnicode_GET_LENGTH(unicode);
1860 kind = PyUnicode_KIND(unicode);
1861 if (kind == PyUnicode_1BYTE_KIND) {
1862 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001863 max_char = ucs1lib_find_max_char(u, u + len);
1864 if (max_char >= 128)
1865 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001866 }
1867 else if (kind == PyUnicode_2BYTE_KIND) {
1868 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001869 max_char = ucs2lib_find_max_char(u, u + len);
1870 if (max_char >= 256)
1871 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001872 }
1873 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001874 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02001875 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001876 max_char = ucs4lib_find_max_char(u, u + len);
1877 if (max_char >= 0x10000)
1878 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001879 }
Victor Stinner25a4b292011-10-06 12:31:55 +02001880 copy = PyUnicode_New(len, max_char);
1881 copy_characters(copy, 0, unicode, 0, len);
1882 Py_DECREF(unicode);
1883 *p_unicode = copy;
1884}
1885
Victor Stinner034f6cf2011-09-30 02:26:44 +02001886PyObject*
1887PyUnicode_Copy(PyObject *unicode)
1888{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001889 Py_ssize_t size;
1890 PyObject *copy;
1891 void *data;
1892
Victor Stinner034f6cf2011-09-30 02:26:44 +02001893 if (!PyUnicode_Check(unicode)) {
1894 PyErr_BadInternalCall();
1895 return NULL;
1896 }
1897 if (PyUnicode_READY(unicode))
1898 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001899
1900 size = PyUnicode_GET_LENGTH(unicode);
1901 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1902 if (!copy)
1903 return NULL;
1904 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1905
1906 data = PyUnicode_DATA(unicode);
1907 switch (PyUnicode_KIND(unicode))
1908 {
1909 case PyUnicode_1BYTE_KIND:
1910 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1911 break;
1912 case PyUnicode_2BYTE_KIND:
1913 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1914 break;
1915 case PyUnicode_4BYTE_KIND:
1916 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1917 break;
1918 default:
1919 assert(0);
1920 break;
1921 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001922 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001923 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001924}
1925
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001926
Victor Stinnerbc603d12011-10-02 01:00:40 +02001927/* Widen Unicode objects to larger buffers. Don't write terminating null
1928 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001929
1930void*
1931_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1932{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001933 Py_ssize_t len;
1934 void *result;
1935 unsigned int skind;
1936
1937 if (PyUnicode_READY(s))
1938 return NULL;
1939
1940 len = PyUnicode_GET_LENGTH(s);
1941 skind = PyUnicode_KIND(s);
1942 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02001943 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001944 return NULL;
1945 }
1946 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001947 case PyUnicode_2BYTE_KIND:
1948 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1949 if (!result)
1950 return PyErr_NoMemory();
1951 assert(skind == PyUnicode_1BYTE_KIND);
1952 _PyUnicode_CONVERT_BYTES(
1953 Py_UCS1, Py_UCS2,
1954 PyUnicode_1BYTE_DATA(s),
1955 PyUnicode_1BYTE_DATA(s) + len,
1956 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001957 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001958 case PyUnicode_4BYTE_KIND:
1959 result = PyMem_Malloc(len * sizeof(Py_UCS4));
1960 if (!result)
1961 return PyErr_NoMemory();
1962 if (skind == PyUnicode_2BYTE_KIND) {
1963 _PyUnicode_CONVERT_BYTES(
1964 Py_UCS2, Py_UCS4,
1965 PyUnicode_2BYTE_DATA(s),
1966 PyUnicode_2BYTE_DATA(s) + len,
1967 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001968 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001969 else {
1970 assert(skind == PyUnicode_1BYTE_KIND);
1971 _PyUnicode_CONVERT_BYTES(
1972 Py_UCS1, Py_UCS4,
1973 PyUnicode_1BYTE_DATA(s),
1974 PyUnicode_1BYTE_DATA(s) + len,
1975 result);
1976 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001977 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001978 default:
1979 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001980 }
Victor Stinner01698042011-10-04 00:04:26 +02001981 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001982 return NULL;
1983}
1984
1985static Py_UCS4*
1986as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1987 int copy_null)
1988{
1989 int kind;
1990 void *data;
1991 Py_ssize_t len, targetlen;
1992 if (PyUnicode_READY(string) == -1)
1993 return NULL;
1994 kind = PyUnicode_KIND(string);
1995 data = PyUnicode_DATA(string);
1996 len = PyUnicode_GET_LENGTH(string);
1997 targetlen = len;
1998 if (copy_null)
1999 targetlen++;
2000 if (!target) {
2001 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2002 PyErr_NoMemory();
2003 return NULL;
2004 }
2005 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2006 if (!target) {
2007 PyErr_NoMemory();
2008 return NULL;
2009 }
2010 }
2011 else {
2012 if (targetsize < targetlen) {
2013 PyErr_Format(PyExc_SystemError,
2014 "string is longer than the buffer");
2015 if (copy_null && 0 < targetsize)
2016 target[0] = 0;
2017 return NULL;
2018 }
2019 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002020 if (kind == PyUnicode_1BYTE_KIND) {
2021 Py_UCS1 *start = (Py_UCS1 *) data;
2022 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002023 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002024 else if (kind == PyUnicode_2BYTE_KIND) {
2025 Py_UCS2 *start = (Py_UCS2 *) data;
2026 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2027 }
2028 else {
2029 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002030 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002031 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002032 if (copy_null)
2033 target[len] = 0;
2034 return target;
2035}
2036
2037Py_UCS4*
2038PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2039 int copy_null)
2040{
2041 if (target == NULL || targetsize < 1) {
2042 PyErr_BadInternalCall();
2043 return NULL;
2044 }
2045 return as_ucs4(string, target, targetsize, copy_null);
2046}
2047
2048Py_UCS4*
2049PyUnicode_AsUCS4Copy(PyObject *string)
2050{
2051 return as_ucs4(string, NULL, 0, 1);
2052}
2053
#ifdef HAVE_WCHAR_H

/* Create a unicode object from the wchar_t buffer `w`.

   A size of -1 means "measure with wcslen()".  With w == NULL only a size
   of 0 is accepted (yielding PyUnicode_New(0, 0)); any other size raises
   via PyErr_BadInternalCall().  The conversion itself is delegated to
   PyUnicode_FromUnicode(). */
PyObject *
PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
{
    if (w == NULL) {
        if (size != 0) {
            PyErr_BadInternalCall();
            return NULL;
        }
        return PyUnicode_New(0, 0);
    }

    if (size == -1)
        size = wcslen(w);

    return PyUnicode_FromUnicode(w, size);
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002074
Walter Dörwald346737f2007-05-31 10:44:43 +00002075static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002076makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2077 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002078{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002079 *fmt++ = '%';
2080 if (width) {
2081 if (zeropad)
2082 *fmt++ = '0';
2083 fmt += sprintf(fmt, "%d", width);
2084 }
2085 if (precision)
2086 fmt += sprintf(fmt, ".%d", precision);
2087 if (longflag)
2088 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002089 else if (longlongflag) {
2090 /* longlongflag should only ever be nonzero on machines with
2091 HAVE_LONG_LONG defined */
2092#ifdef HAVE_LONG_LONG
2093 char *f = PY_FORMAT_LONG_LONG;
2094 while (*f)
2095 *fmt++ = *f++;
2096#else
2097 /* we shouldn't ever get here */
2098 assert(0);
2099 *fmt++ = 'l';
2100#endif
2101 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002102 else if (size_tflag) {
2103 char *f = PY_FORMAT_SIZE_T;
2104 while (*f)
2105 *fmt++ = *f++;
2106 }
2107 *fmt++ = c;
2108 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002109}
2110
/* helper for PyUnicode_FromFormatV()

   Parse the part of a format specification that follows '%': an optional
   width, an optional ".precision" and an optional length modifier ("l",
   "ll" when HAVE_LONG_LONG is defined, or "z"; each only when followed by
   'd', 'u' or 'i').  `f` must point at the '%'.

   Each result is stored through its output pointer unless that pointer is
   NULL.  Returns a pointer to the character following the parsed part. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    f++;
    for (; Py_ISDIGIT((unsigned)*f); f++)
        width = (width*10) + *f - '0';
    if (*f == '.') {
        f++;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }

    /* Handle %ld, %lu, %lld, %llu and the size_t flag. */
    switch (*f) {
    case 'l':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            ++f;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
        break;
    case 'z':
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            size_tflag = 1;
            ++f;
        }
        break;
    default:
        break;
    }

    /* publish the results the caller asked for */
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;
    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
2175
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002176/* maximum number of characters required for output of %ld. 21 characters
2177 allows for 64-bit integers (in decimal) and an optional sign. */
2178#define MAX_LONG_CHARS 21
2179/* maximum number of characters required for output of %lld.
2180 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2181 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2182#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2183
Walter Dörwaldd2034312007-05-18 16:29:38 +00002184PyObject *
2185PyUnicode_FromFormatV(const char *format, va_list vargs)
2186{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002187 va_list count;
2188 Py_ssize_t callcount = 0;
2189 PyObject **callresults = NULL;
2190 PyObject **callresult = NULL;
2191 Py_ssize_t n = 0;
2192 int width = 0;
2193 int precision = 0;
2194 int zeropad;
2195 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002196 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002197 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002198 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002199 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2200 Py_UCS4 argmaxchar;
2201 Py_ssize_t numbersize = 0;
2202 char *numberresults = NULL;
2203 char *numberresult = NULL;
2204 Py_ssize_t i;
2205 int kind;
2206 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002207
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002208 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002209 /* step 1: count the number of %S/%R/%A/%s format specifications
2210 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2211 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002212 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002213 * also estimate a upper bound for all the number formats in the string,
2214 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002215 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002216 for (f = format; *f; f++) {
2217 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002218 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002219 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2220 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2221 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2222 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002223
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002224 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002225#ifdef HAVE_LONG_LONG
2226 if (longlongflag) {
2227 if (width < MAX_LONG_LONG_CHARS)
2228 width = MAX_LONG_LONG_CHARS;
2229 }
2230 else
2231#endif
2232 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2233 including sign. Decimal takes the most space. This
2234 isn't enough for octal. If a width is specified we
2235 need more (which we allocate later). */
2236 if (width < MAX_LONG_CHARS)
2237 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002238
2239 /* account for the size + '\0' to separate numbers
2240 inside of the numberresults buffer */
2241 numbersize += (width + 1);
2242 }
2243 }
2244 else if ((unsigned char)*f > 127) {
2245 PyErr_Format(PyExc_ValueError,
2246 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2247 "string, got a non-ASCII byte: 0x%02x",
2248 (unsigned char)*f);
2249 return NULL;
2250 }
2251 }
2252 /* step 2: allocate memory for the results of
2253 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2254 if (callcount) {
2255 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2256 if (!callresults) {
2257 PyErr_NoMemory();
2258 return NULL;
2259 }
2260 callresult = callresults;
2261 }
2262 /* step 2.5: allocate memory for the results of formating numbers */
2263 if (numbersize) {
2264 numberresults = PyObject_Malloc(numbersize);
2265 if (!numberresults) {
2266 PyErr_NoMemory();
2267 goto fail;
2268 }
2269 numberresult = numberresults;
2270 }
2271
2272 /* step 3: format numbers and figure out how large a buffer we need */
2273 for (f = format; *f; f++) {
2274 if (*f == '%') {
2275 const char* p;
2276 int longflag;
2277 int longlongflag;
2278 int size_tflag;
2279 int numprinted;
2280
2281 p = f;
2282 zeropad = (f[1] == '0');
2283 f = parse_format_flags(f, &width, &precision,
2284 &longflag, &longlongflag, &size_tflag);
2285 switch (*f) {
2286 case 'c':
2287 {
2288 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002289 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002290 n++;
2291 break;
2292 }
2293 case '%':
2294 n++;
2295 break;
2296 case 'i':
2297 case 'd':
2298 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2299 width, precision, *f);
2300 if (longflag)
2301 numprinted = sprintf(numberresult, fmt,
2302 va_arg(count, long));
2303#ifdef HAVE_LONG_LONG
2304 else if (longlongflag)
2305 numprinted = sprintf(numberresult, fmt,
2306 va_arg(count, PY_LONG_LONG));
2307#endif
2308 else if (size_tflag)
2309 numprinted = sprintf(numberresult, fmt,
2310 va_arg(count, Py_ssize_t));
2311 else
2312 numprinted = sprintf(numberresult, fmt,
2313 va_arg(count, int));
2314 n += numprinted;
2315 /* advance by +1 to skip over the '\0' */
2316 numberresult += (numprinted + 1);
2317 assert(*(numberresult - 1) == '\0');
2318 assert(*(numberresult - 2) != '\0');
2319 assert(numprinted >= 0);
2320 assert(numberresult <= numberresults + numbersize);
2321 break;
2322 case 'u':
2323 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2324 width, precision, 'u');
2325 if (longflag)
2326 numprinted = sprintf(numberresult, fmt,
2327 va_arg(count, unsigned long));
2328#ifdef HAVE_LONG_LONG
2329 else if (longlongflag)
2330 numprinted = sprintf(numberresult, fmt,
2331 va_arg(count, unsigned PY_LONG_LONG));
2332#endif
2333 else if (size_tflag)
2334 numprinted = sprintf(numberresult, fmt,
2335 va_arg(count, size_t));
2336 else
2337 numprinted = sprintf(numberresult, fmt,
2338 va_arg(count, unsigned int));
2339 n += numprinted;
2340 numberresult += (numprinted + 1);
2341 assert(*(numberresult - 1) == '\0');
2342 assert(*(numberresult - 2) != '\0');
2343 assert(numprinted >= 0);
2344 assert(numberresult <= numberresults + numbersize);
2345 break;
2346 case 'x':
2347 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2348 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2349 n += numprinted;
2350 numberresult += (numprinted + 1);
2351 assert(*(numberresult - 1) == '\0');
2352 assert(*(numberresult - 2) != '\0');
2353 assert(numprinted >= 0);
2354 assert(numberresult <= numberresults + numbersize);
2355 break;
2356 case 'p':
2357 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2358 /* %p is ill-defined: ensure leading 0x. */
2359 if (numberresult[1] == 'X')
2360 numberresult[1] = 'x';
2361 else if (numberresult[1] != 'x') {
2362 memmove(numberresult + 2, numberresult,
2363 strlen(numberresult) + 1);
2364 numberresult[0] = '0';
2365 numberresult[1] = 'x';
2366 numprinted += 2;
2367 }
2368 n += numprinted;
2369 numberresult += (numprinted + 1);
2370 assert(*(numberresult - 1) == '\0');
2371 assert(*(numberresult - 2) != '\0');
2372 assert(numprinted >= 0);
2373 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002374 break;
2375 case 's':
2376 {
2377 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002378 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002379 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2380 if (!str)
2381 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002382 /* since PyUnicode_DecodeUTF8 returns already flexible
2383 unicode objects, there is no need to call ready on them */
2384 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002385 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002386 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002387 /* Remember the str and switch to the next slot */
2388 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002389 break;
2390 }
2391 case 'U':
2392 {
2393 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002394 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002395 if (PyUnicode_READY(obj) == -1)
2396 goto fail;
2397 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002398 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002399 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002400 break;
2401 }
2402 case 'V':
2403 {
2404 PyObject *obj = va_arg(count, PyObject *);
2405 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002406 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002407 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002408 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002409 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002410 if (PyUnicode_READY(obj) == -1)
2411 goto fail;
2412 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002413 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002414 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002415 *callresult++ = NULL;
2416 }
2417 else {
2418 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2419 if (!str_obj)
2420 goto fail;
Victor Stinnere1335c72011-10-04 20:53:03 +02002421 if (PyUnicode_READY(str_obj)) {
2422 Py_DECREF(str_obj);
2423 goto fail;
2424 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002425 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002426 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002427 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002428 *callresult++ = str_obj;
2429 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002430 break;
2431 }
2432 case 'S':
2433 {
2434 PyObject *obj = va_arg(count, PyObject *);
2435 PyObject *str;
2436 assert(obj);
2437 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002438 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002439 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002440 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002441 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002442 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002443 /* Remember the str and switch to the next slot */
2444 *callresult++ = str;
2445 break;
2446 }
2447 case 'R':
2448 {
2449 PyObject *obj = va_arg(count, PyObject *);
2450 PyObject *repr;
2451 assert(obj);
2452 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002453 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002454 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002455 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002456 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002457 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002458 /* Remember the repr and switch to the next slot */
2459 *callresult++ = repr;
2460 break;
2461 }
2462 case 'A':
2463 {
2464 PyObject *obj = va_arg(count, PyObject *);
2465 PyObject *ascii;
2466 assert(obj);
2467 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002468 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002469 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002470 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002471 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002472 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002473 /* Remember the repr and switch to the next slot */
2474 *callresult++ = ascii;
2475 break;
2476 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002477 default:
2478 /* if we stumble upon an unknown
2479 formatting code, copy the rest of
2480 the format string to the output
2481 string. (we cannot just skip the
2482 code, since there's no way to know
2483 what's in the argument list) */
2484 n += strlen(p);
2485 goto expand;
2486 }
2487 } else
2488 n++;
2489 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002490 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002491 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002492 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002493 we don't have to resize the string.
2494 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002495 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002496 if (!string)
2497 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002498 kind = PyUnicode_KIND(string);
2499 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002500 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002501 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002502
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002503 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002504 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002505 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002506
2507 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002508 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2509 /* checking for == because the last argument could be a empty
2510 string, which causes i to point to end, the assert at the end of
2511 the loop */
2512 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002513
Benjamin Peterson14339b62009-01-31 16:36:08 +00002514 switch (*f) {
2515 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002516 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002517 const int ordinal = va_arg(vargs, int);
2518 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002519 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002520 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002521 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002522 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002523 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002524 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002525 case 'p':
2526 /* unused, since we already have the result */
2527 if (*f == 'p')
2528 (void) va_arg(vargs, void *);
2529 else
2530 (void) va_arg(vargs, int);
2531 /* extract the result from numberresults and append. */
2532 for (; *numberresult; ++i, ++numberresult)
2533 PyUnicode_WRITE(kind, data, i, *numberresult);
2534 /* skip over the separating '\0' */
2535 assert(*numberresult == '\0');
2536 numberresult++;
2537 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002538 break;
2539 case 's':
2540 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002541 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002542 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002543 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002544 size = PyUnicode_GET_LENGTH(*callresult);
2545 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002546 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002547 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002548 /* We're done with the unicode()/repr() => forget it */
2549 Py_DECREF(*callresult);
2550 /* switch to next unicode()/repr() result */
2551 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002552 break;
2553 }
2554 case 'U':
2555 {
2556 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002557 Py_ssize_t size;
2558 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2559 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002560 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002561 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002562 break;
2563 }
2564 case 'V':
2565 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002566 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002567 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002568 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002569 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002570 size = PyUnicode_GET_LENGTH(obj);
2571 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002572 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002573 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002574 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002575 size = PyUnicode_GET_LENGTH(*callresult);
2576 assert(PyUnicode_KIND(*callresult) <=
2577 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002578 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002579 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002580 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002581 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002582 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002583 break;
2584 }
2585 case 'S':
2586 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002587 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002588 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002589 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002590 /* unused, since we already have the result */
2591 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002592 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002593 copy_characters(string, i, *callresult, 0, size);
2594 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002595 /* We're done with the unicode()/repr() => forget it */
2596 Py_DECREF(*callresult);
2597 /* switch to next unicode()/repr() result */
2598 ++callresult;
2599 break;
2600 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002601 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002602 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002603 break;
2604 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002605 for (; *p; ++p, ++i)
2606 PyUnicode_WRITE(kind, data, i, *p);
2607 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002608 goto end;
2609 }
Victor Stinner1205f272010-09-11 00:54:47 +00002610 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002611 else {
2612 assert(i < PyUnicode_GET_LENGTH(string));
2613 PyUnicode_WRITE(kind, data, i++, *f);
2614 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002615 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002616 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002617
Benjamin Peterson29060642009-01-31 22:14:21 +00002618 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002619 if (callresults)
2620 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002621 if (numberresults)
2622 PyObject_Free(numberresults);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002623 assert(_PyUnicode_CheckConsistency(string, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002624 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00002625 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002626 if (callresults) {
2627 PyObject **callresult2 = callresults;
2628 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002629 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002630 ++callresult2;
2631 }
2632 PyObject_Free(callresults);
2633 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002634 if (numberresults)
2635 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002636 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002637}
2638
Walter Dörwaldd2034312007-05-18 16:29:38 +00002639PyObject *
2640PyUnicode_FromFormat(const char *format, ...)
2641{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002642 PyObject* ret;
2643 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002644
2645#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002646 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002647#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002648 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002649#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002650 ret = PyUnicode_FromFormatV(format, vargs);
2651 va_end(vargs);
2652 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002653}
2654
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002655#ifdef HAVE_WCHAR_H
2656
Victor Stinner5593d8a2010-10-02 11:11:27 +00002657/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2658 convert a Unicode object to a wide character string.
2659
Victor Stinnerd88d9832011-09-06 02:00:05 +02002660 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002661 character) required to convert the unicode object. Ignore size argument.
2662
Victor Stinnerd88d9832011-09-06 02:00:05 +02002663 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002664 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002665 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002666static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00002667unicode_aswidechar(PyUnicodeObject *unicode,
2668 wchar_t *w,
2669 Py_ssize_t size)
2670{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002671 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002672 const wchar_t *wstr;
2673
2674 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
2675 if (wstr == NULL)
2676 return -1;
2677
Victor Stinner5593d8a2010-10-02 11:11:27 +00002678 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002679 if (size > res)
2680 size = res + 1;
2681 else
2682 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002683 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002684 return res;
2685 }
2686 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002687 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002688}
2689
2690Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002691PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002692 wchar_t *w,
2693 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002694{
2695 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002696 PyErr_BadInternalCall();
2697 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002698 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002699 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002700}
2701
Victor Stinner137c34c2010-09-29 10:25:54 +00002702wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002703PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002704 Py_ssize_t *size)
2705{
2706 wchar_t* buffer;
2707 Py_ssize_t buflen;
2708
2709 if (unicode == NULL) {
2710 PyErr_BadInternalCall();
2711 return NULL;
2712 }
2713
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002714 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002715 if (buflen == -1)
2716 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002717 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002718 PyErr_NoMemory();
2719 return NULL;
2720 }
2721
Victor Stinner137c34c2010-09-29 10:25:54 +00002722 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2723 if (buffer == NULL) {
2724 PyErr_NoMemory();
2725 return NULL;
2726 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002727 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002728 if (buflen == -1)
2729 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002730 if (size != NULL)
2731 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002732 return buffer;
2733}
2734
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002735#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002736
Alexander Belopolsky40018472011-02-26 01:02:56 +00002737PyObject *
2738PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002739{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002740 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002741 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002742 PyErr_SetString(PyExc_ValueError,
2743 "chr() arg not in range(0x110000)");
2744 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002745 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002746
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002747 if (ordinal < 256)
2748 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002749
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002750 v = PyUnicode_New(1, ordinal);
2751 if (v == NULL)
2752 return NULL;
2753 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002754 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002755 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002756}
2757
Alexander Belopolsky40018472011-02-26 01:02:56 +00002758PyObject *
2759PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002760{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002761 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002762 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002763 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002764 if (PyUnicode_READY(obj))
2765 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002766 Py_INCREF(obj);
2767 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002768 }
2769 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002770 /* For a Unicode subtype that's not a Unicode object,
2771 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002772 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002773 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002774 PyErr_Format(PyExc_TypeError,
2775 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002776 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002777 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002778}
2779
Alexander Belopolsky40018472011-02-26 01:02:56 +00002780PyObject *
2781PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002782 const char *encoding,
2783 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002784{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002785 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002786 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002787
Guido van Rossumd57fd912000-03-10 22:53:23 +00002788 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002789 PyErr_BadInternalCall();
2790 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002791 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002792
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002793 /* Decoding bytes objects is the most common case and should be fast */
2794 if (PyBytes_Check(obj)) {
2795 if (PyBytes_GET_SIZE(obj) == 0) {
2796 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002797 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002798 }
2799 else {
2800 v = PyUnicode_Decode(
2801 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2802 encoding, errors);
2803 }
2804 return v;
2805 }
2806
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002807 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002808 PyErr_SetString(PyExc_TypeError,
2809 "decoding str is not supported");
2810 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002811 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002812
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002813 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2814 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2815 PyErr_Format(PyExc_TypeError,
2816 "coercing to str: need bytes, bytearray "
2817 "or buffer-like object, %.80s found",
2818 Py_TYPE(obj)->tp_name);
2819 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002820 }
Tim Petersced69f82003-09-16 20:30:58 +00002821
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002822 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002823 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002824 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002825 }
Tim Petersced69f82003-09-16 20:30:58 +00002826 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002827 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002828
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002829 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002830 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002831}
2832
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding is normalized to "utf-8".

   encoding:  null-terminated encoding name, or NULL.
   lower:     output buffer receiving the null-terminated normalized name.
   lower_len: size of the output buffer in bytes, null character included.

   Return 0 on error (the normalized name does not fit into lower_len-1
   characters; the content of lower is then unspecified), 1 on success. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    /* without room for the null character, lower_len - 1 below would
       wrap around */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* sizeof counts the terminating null character as well */
        if (lower_len < sizeof("utf-8"))
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }
    e = encoding;
    l = lower;
    /* last slot of the buffer, reserved for the null character */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2869
Alexander Belopolsky40018472011-02-26 01:02:56 +00002870PyObject *
2871PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002872 Py_ssize_t size,
2873 const char *encoding,
2874 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002875{
2876 PyObject *buffer = NULL, *unicode;
2877 Py_buffer info;
2878 char lower[11]; /* Enough for any encoding shortcut */
2879
Fred Drakee4315f52000-05-09 19:53:39 +00002880 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002881 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002882 if ((strcmp(lower, "utf-8") == 0) ||
2883 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002884 return PyUnicode_DecodeUTF8(s, size, errors);
2885 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002886 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002887 (strcmp(lower, "iso-8859-1") == 0))
2888 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002889#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002890 else if (strcmp(lower, "mbcs") == 0)
2891 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002892#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002893 else if (strcmp(lower, "ascii") == 0)
2894 return PyUnicode_DecodeASCII(s, size, errors);
2895 else if (strcmp(lower, "utf-16") == 0)
2896 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2897 else if (strcmp(lower, "utf-32") == 0)
2898 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2899 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002900
2901 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002902 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002903 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002904 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002905 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002906 if (buffer == NULL)
2907 goto onError;
2908 unicode = PyCodec_Decode(buffer, encoding, errors);
2909 if (unicode == NULL)
2910 goto onError;
2911 if (!PyUnicode_Check(unicode)) {
2912 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002913 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002914 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002915 Py_DECREF(unicode);
2916 goto onError;
2917 }
2918 Py_DECREF(buffer);
Victor Stinner17efeed2011-10-04 20:05:46 +02002919#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02002920 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002921 Py_DECREF(unicode);
2922 return NULL;
2923 }
Victor Stinner17efeed2011-10-04 20:05:46 +02002924#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002925 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002926 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002927
Benjamin Peterson29060642009-01-31 22:14:21 +00002928 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002929 Py_XDECREF(buffer);
2930 return NULL;
2931}
2932
Alexander Belopolsky40018472011-02-26 01:02:56 +00002933PyObject *
2934PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002935 const char *encoding,
2936 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002937{
2938 PyObject *v;
2939
2940 if (!PyUnicode_Check(unicode)) {
2941 PyErr_BadArgument();
2942 goto onError;
2943 }
2944
2945 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002946 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002947
2948 /* Decode via the codec registry */
2949 v = PyCodec_Decode(unicode, encoding, errors);
2950 if (v == NULL)
2951 goto onError;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002952 assert(_PyUnicode_CheckConsistency(v, 1));
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002953 return v;
2954
Benjamin Peterson29060642009-01-31 22:14:21 +00002955 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002956 return NULL;
2957}
2958
Alexander Belopolsky40018472011-02-26 01:02:56 +00002959PyObject *
2960PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002961 const char *encoding,
2962 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002963{
2964 PyObject *v;
2965
2966 if (!PyUnicode_Check(unicode)) {
2967 PyErr_BadArgument();
2968 goto onError;
2969 }
2970
2971 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002972 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002973
2974 /* Decode via the codec registry */
2975 v = PyCodec_Decode(unicode, encoding, errors);
2976 if (v == NULL)
2977 goto onError;
2978 if (!PyUnicode_Check(v)) {
2979 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002980 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002981 Py_TYPE(v)->tp_name);
2982 Py_DECREF(v);
2983 goto onError;
2984 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002985 assert(_PyUnicode_CheckConsistency(v, 1));
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002986 return v;
2987
Benjamin Peterson29060642009-01-31 22:14:21 +00002988 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002989 return NULL;
2990}
2991
Alexander Belopolsky40018472011-02-26 01:02:56 +00002992PyObject *
2993PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002994 Py_ssize_t size,
2995 const char *encoding,
2996 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002997{
2998 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002999
Guido van Rossumd57fd912000-03-10 22:53:23 +00003000 unicode = PyUnicode_FromUnicode(s, size);
3001 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003002 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003003 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3004 Py_DECREF(unicode);
3005 return v;
3006}
3007
Alexander Belopolsky40018472011-02-26 01:02:56 +00003008PyObject *
3009PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003010 const char *encoding,
3011 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003012{
3013 PyObject *v;
3014
3015 if (!PyUnicode_Check(unicode)) {
3016 PyErr_BadArgument();
3017 goto onError;
3018 }
3019
3020 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003021 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003022
3023 /* Encode via the codec registry */
3024 v = PyCodec_Encode(unicode, encoding, errors);
3025 if (v == NULL)
3026 goto onError;
3027 return v;
3028
Benjamin Peterson29060642009-01-31 22:14:21 +00003029 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003030 return NULL;
3031}
3032
Victor Stinnerad158722010-10-27 00:25:46 +00003033PyObject *
3034PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003035{
Victor Stinner99b95382011-07-04 14:23:54 +02003036#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003037 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3038 PyUnicode_GET_SIZE(unicode),
3039 NULL);
3040#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003041 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003042#else
Victor Stinner793b5312011-04-27 00:24:21 +02003043 PyInterpreterState *interp = PyThreadState_GET()->interp;
3044 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3045 cannot use it to encode and decode filenames before it is loaded. Load
3046 the Python codec requires to encode at least its own filename. Use the C
3047 version of the locale codec until the codec registry is initialized and
3048 the Python codec is loaded.
3049
3050 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3051 cannot only rely on it: check also interp->fscodec_initialized for
3052 subinterpreters. */
3053 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003054 return PyUnicode_AsEncodedString(unicode,
3055 Py_FileSystemDefaultEncoding,
3056 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003057 }
3058 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003059 /* locale encoding with surrogateescape */
3060 wchar_t *wchar;
3061 char *bytes;
3062 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00003063 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003064
3065 wchar = PyUnicode_AsWideCharString(unicode, NULL);
3066 if (wchar == NULL)
3067 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003068 bytes = _Py_wchar2char(wchar, &error_pos);
3069 if (bytes == NULL) {
3070 if (error_pos != (size_t)-1) {
3071 char *errmsg = strerror(errno);
3072 PyObject *exc = NULL;
3073 if (errmsg == NULL)
3074 errmsg = "Py_wchar2char() failed";
3075 raise_encode_exception(&exc,
3076 "filesystemencoding",
3077 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
3078 error_pos, error_pos+1,
3079 errmsg);
3080 Py_XDECREF(exc);
3081 }
3082 else
3083 PyErr_NoMemory();
3084 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003085 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003086 }
3087 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003088
3089 bytes_obj = PyBytes_FromString(bytes);
3090 PyMem_Free(bytes);
3091 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00003092 }
Victor Stinnerad158722010-10-27 00:25:46 +00003093#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003094}
3095
Alexander Belopolsky40018472011-02-26 01:02:56 +00003096PyObject *
3097PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003098 const char *encoding,
3099 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003100{
3101 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003102 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003103
Guido van Rossumd57fd912000-03-10 22:53:23 +00003104 if (!PyUnicode_Check(unicode)) {
3105 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003106 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003107 }
Fred Drakee4315f52000-05-09 19:53:39 +00003108
Fred Drakee4315f52000-05-09 19:53:39 +00003109 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003110 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003111 if ((strcmp(lower, "utf-8") == 0) ||
3112 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003113 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003114 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003115 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003116 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003117 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003118 }
Victor Stinner37296e82010-06-10 13:36:23 +00003119 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003120 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003121 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003122 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003123#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003124 else if (strcmp(lower, "mbcs") == 0)
3125 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3126 PyUnicode_GET_SIZE(unicode),
3127 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003128#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003129 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003130 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003131 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003132
3133 /* Encode via the codec registry */
3134 v = PyCodec_Encode(unicode, encoding, errors);
3135 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003136 return NULL;
3137
3138 /* The normal path */
3139 if (PyBytes_Check(v))
3140 return v;
3141
3142 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003143 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003144 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003145 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003146
3147 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3148 "encoder %s returned bytearray instead of bytes",
3149 encoding);
3150 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003151 Py_DECREF(v);
3152 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003153 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003154
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003155 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3156 Py_DECREF(v);
3157 return b;
3158 }
3159
3160 PyErr_Format(PyExc_TypeError,
3161 "encoder did not return a bytes object (type=%.400s)",
3162 Py_TYPE(v)->tp_name);
3163 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003164 return NULL;
3165}
3166
Alexander Belopolsky40018472011-02-26 01:02:56 +00003167PyObject *
3168PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003169 const char *encoding,
3170 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003171{
3172 PyObject *v;
3173
3174 if (!PyUnicode_Check(unicode)) {
3175 PyErr_BadArgument();
3176 goto onError;
3177 }
3178
3179 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003180 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003181
3182 /* Encode via the codec registry */
3183 v = PyCodec_Encode(unicode, encoding, errors);
3184 if (v == NULL)
3185 goto onError;
3186 if (!PyUnicode_Check(v)) {
3187 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003188 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003189 Py_TYPE(v)->tp_name);
3190 Py_DECREF(v);
3191 goto onError;
3192 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003193 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003194
Benjamin Peterson29060642009-01-31 22:14:21 +00003195 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003196 return NULL;
3197}
3198
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003199PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003200PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003201 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003202 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3203}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003204
Christian Heimes5894ba72007-11-04 11:43:14 +00003205PyObject*
3206PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3207{
Victor Stinner99b95382011-07-04 14:23:54 +02003208#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003209 return PyUnicode_DecodeMBCS(s, size, NULL);
3210#elif defined(__APPLE__)
3211 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
3212#else
Victor Stinner793b5312011-04-27 00:24:21 +02003213 PyInterpreterState *interp = PyThreadState_GET()->interp;
3214 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3215 cannot use it to encode and decode filenames before it is loaded. Load
3216 the Python codec requires to encode at least its own filename. Use the C
3217 version of the locale codec until the codec registry is initialized and
3218 the Python codec is loaded.
3219
3220 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3221 cannot only rely on it: check also interp->fscodec_initialized for
3222 subinterpreters. */
3223 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003224 return PyUnicode_Decode(s, size,
3225 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003226 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003227 }
3228 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003229 /* locale encoding with surrogateescape */
3230 wchar_t *wchar;
3231 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00003232 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003233
3234 if (s[size] != '\0' || size != strlen(s)) {
3235 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3236 return NULL;
3237 }
3238
Victor Stinner168e1172010-10-16 23:16:16 +00003239 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003240 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00003241 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003242
Victor Stinner168e1172010-10-16 23:16:16 +00003243 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003244 PyMem_Free(wchar);
3245 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003246 }
Victor Stinnerad158722010-10-27 00:25:46 +00003247#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003248}
3249
Martin v. Löwis011e8422009-05-05 04:43:17 +00003250
3251int
3252PyUnicode_FSConverter(PyObject* arg, void* addr)
3253{
3254 PyObject *output = NULL;
3255 Py_ssize_t size;
3256 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003257 if (arg == NULL) {
3258 Py_DECREF(*(PyObject**)addr);
3259 return 1;
3260 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003261 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003262 output = arg;
3263 Py_INCREF(output);
3264 }
3265 else {
3266 arg = PyUnicode_FromObject(arg);
3267 if (!arg)
3268 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003269 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003270 Py_DECREF(arg);
3271 if (!output)
3272 return 0;
3273 if (!PyBytes_Check(output)) {
3274 Py_DECREF(output);
3275 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3276 return 0;
3277 }
3278 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003279 size = PyBytes_GET_SIZE(output);
3280 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003281 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003282 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003283 Py_DECREF(output);
3284 return 0;
3285 }
3286 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003287 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003288}
3289
3290
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003291int
3292PyUnicode_FSDecoder(PyObject* arg, void* addr)
3293{
3294 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003295 if (arg == NULL) {
3296 Py_DECREF(*(PyObject**)addr);
3297 return 1;
3298 }
3299 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003300 if (PyUnicode_READY(arg))
3301 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003302 output = arg;
3303 Py_INCREF(output);
3304 }
3305 else {
3306 arg = PyBytes_FromObject(arg);
3307 if (!arg)
3308 return 0;
3309 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3310 PyBytes_GET_SIZE(arg));
3311 Py_DECREF(arg);
3312 if (!output)
3313 return 0;
3314 if (!PyUnicode_Check(output)) {
3315 Py_DECREF(output);
3316 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3317 return 0;
3318 }
3319 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003320 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003321 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003322 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3323 Py_DECREF(output);
3324 return 0;
3325 }
3326 *(PyObject**)addr = output;
3327 return Py_CLEANUP_SUPPORTED;
3328}
3329
3330
Martin v. Löwis5b222132007-06-10 09:51:05 +00003331char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003332PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003333{
Christian Heimesf3863112007-11-22 07:46:41 +00003334 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003335 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
3336
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003337 if (!PyUnicode_Check(unicode)) {
3338 PyErr_BadArgument();
3339 return NULL;
3340 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003341 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003342 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003343
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003344 if (PyUnicode_UTF8(unicode) == NULL) {
3345 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003346 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3347 if (bytes == NULL)
3348 return NULL;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003349 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3350 if (_PyUnicode_UTF8(u) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003351 Py_DECREF(bytes);
3352 return NULL;
3353 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003354 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
3355 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003356 Py_DECREF(bytes);
3357 }
3358
3359 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003360 *psize = PyUnicode_UTF8_LENGTH(unicode);
3361 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003362}
3363
3364char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003365PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003366{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003367 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3368}
3369
#if defined(Py_DEBUG)
/* Debug-build counter: bumped by PyUnicode_AsUnicodeAndSize() whenever it
   has to create the wchar_t representation of a string. */
static int unicode_as_unicode_calls = 0;
#endif
3373
3374
3375Py_UNICODE *
3376PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3377{
3378 PyUnicodeObject *u;
3379 const unsigned char *one_byte;
3380#if SIZEOF_WCHAR_T == 4
3381 const Py_UCS2 *two_bytes;
3382#else
3383 const Py_UCS4 *four_bytes;
3384 const Py_UCS4 *ucs4_end;
3385 Py_ssize_t num_surrogates;
3386#endif
3387 wchar_t *w;
3388 wchar_t *wchar_end;
3389
3390 if (!PyUnicode_Check(unicode)) {
3391 PyErr_BadArgument();
3392 return NULL;
3393 }
3394 u = (PyUnicodeObject*)unicode;
3395 if (_PyUnicode_WSTR(u) == NULL) {
3396 /* Non-ASCII compact unicode object */
3397 assert(_PyUnicode_KIND(u) != 0);
3398 assert(PyUnicode_IS_READY(u));
3399
3400#ifdef Py_DEBUG
3401 ++unicode_as_unicode_calls;
3402#endif
3403
3404 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
3405#if SIZEOF_WCHAR_T == 2
3406 four_bytes = PyUnicode_4BYTE_DATA(u);
3407 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
3408 num_surrogates = 0;
3409
3410 for (; four_bytes < ucs4_end; ++four_bytes) {
3411 if (*four_bytes > 0xFFFF)
3412 ++num_surrogates;
3413 }
3414
3415 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
3416 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
3417 if (!_PyUnicode_WSTR(u)) {
3418 PyErr_NoMemory();
3419 return NULL;
3420 }
3421 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
3422
3423 w = _PyUnicode_WSTR(u);
3424 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
3425 four_bytes = PyUnicode_4BYTE_DATA(u);
3426 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3427 if (*four_bytes > 0xFFFF) {
3428 /* encode surrogate pair in this case */
3429 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
3430 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
3431 }
3432 else
3433 *w = *four_bytes;
3434
3435 if (w > wchar_end) {
3436 assert(0 && "Miscalculated string end");
3437 }
3438 }
3439 *w = 0;
3440#else
3441 /* sizeof(wchar_t) == 4 */
3442 Py_FatalError("Impossible unicode object state, wstr and str "
3443 "should share memory already.");
3444 return NULL;
3445#endif
3446 }
3447 else {
3448 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3449 (_PyUnicode_LENGTH(u) + 1));
3450 if (!_PyUnicode_WSTR(u)) {
3451 PyErr_NoMemory();
3452 return NULL;
3453 }
3454 if (!PyUnicode_IS_COMPACT_ASCII(u))
3455 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
3456 w = _PyUnicode_WSTR(u);
3457 wchar_end = w + _PyUnicode_LENGTH(u);
3458
3459 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
3460 one_byte = PyUnicode_1BYTE_DATA(u);
3461 for (; w < wchar_end; ++one_byte, ++w)
3462 *w = *one_byte;
3463 /* null-terminate the wstr */
3464 *w = 0;
3465 }
3466 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
3467#if SIZEOF_WCHAR_T == 4
3468 two_bytes = PyUnicode_2BYTE_DATA(u);
3469 for (; w < wchar_end; ++two_bytes, ++w)
3470 *w = *two_bytes;
3471 /* null-terminate the wstr */
3472 *w = 0;
3473#else
3474 /* sizeof(wchar_t) == 2 */
3475 PyObject_FREE(_PyUnicode_WSTR(u));
3476 _PyUnicode_WSTR(u) = NULL;
3477 Py_FatalError("Impossible unicode object state, wstr "
3478 "and str should share memory already.");
3479 return NULL;
3480#endif
3481 }
3482 else {
3483 assert(0 && "This should never happen.");
3484 }
3485 }
3486 }
3487 if (size != NULL)
3488 *size = PyUnicode_WSTR_LENGTH(u);
3489 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003490}
3491
Alexander Belopolsky40018472011-02-26 01:02:56 +00003492Py_UNICODE *
3493PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003494{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003495 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003496}
3497
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003498
Alexander Belopolsky40018472011-02-26 01:02:56 +00003499Py_ssize_t
3500PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003501{
3502 if (!PyUnicode_Check(unicode)) {
3503 PyErr_BadArgument();
3504 goto onError;
3505 }
3506 return PyUnicode_GET_SIZE(unicode);
3507
Benjamin Peterson29060642009-01-31 22:14:21 +00003508 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003509 return -1;
3510}
3511
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003512Py_ssize_t
3513PyUnicode_GetLength(PyObject *unicode)
3514{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003515 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003516 PyErr_BadArgument();
3517 return -1;
3518 }
3519
3520 return PyUnicode_GET_LENGTH(unicode);
3521}
3522
3523Py_UCS4
3524PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3525{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003526 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3527 PyErr_BadArgument();
3528 return (Py_UCS4)-1;
3529 }
3530 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3531 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003532 return (Py_UCS4)-1;
3533 }
3534 return PyUnicode_READ_CHAR(unicode, index);
3535}
3536
3537int
3538PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3539{
3540 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003541 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003542 return -1;
3543 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003544 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3545 PyErr_SetString(PyExc_IndexError, "string index out of range");
3546 return -1;
3547 }
3548 if (_PyUnicode_Dirty(unicode))
3549 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003550 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3551 index, ch);
3552 return 0;
3553}
3554
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3560
Victor Stinner554f3f02010-06-16 23:33:54 +00003561/* create or adjust a UnicodeDecodeError */
3562static void
3563make_decode_exception(PyObject **exceptionObject,
3564 const char *encoding,
3565 const char *input, Py_ssize_t length,
3566 Py_ssize_t startpos, Py_ssize_t endpos,
3567 const char *reason)
3568{
3569 if (*exceptionObject == NULL) {
3570 *exceptionObject = PyUnicodeDecodeError_Create(
3571 encoding, input, length, startpos, endpos, reason);
3572 }
3573 else {
3574 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3575 goto onError;
3576 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3577 goto onError;
3578 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3579 goto onError;
3580 }
3581 return;
3582
3583onError:
3584 Py_DECREF(*exceptionObject);
3585 *exceptionObject = NULL;
3586}
3587
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003588/* error handling callback helper:
3589 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003590 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003591 and adjust various state variables.
3592 return 0 on success, -1 on error
3593*/
3594
Alexander Belopolsky40018472011-02-26 01:02:56 +00003595static int
3596unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003597 const char *encoding, const char *reason,
3598 const char **input, const char **inend, Py_ssize_t *startinpos,
3599 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3600 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003601{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003602 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003603
3604 PyObject *restuple = NULL;
3605 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003606 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003607 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003608 Py_ssize_t requiredsize;
3609 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003610 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003611 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003612 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003613 int res = -1;
3614
3615 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003616 *errorHandler = PyCodec_LookupError(errors);
3617 if (*errorHandler == NULL)
3618 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003619 }
3620
Victor Stinner554f3f02010-06-16 23:33:54 +00003621 make_decode_exception(exceptionObject,
3622 encoding,
3623 *input, *inend - *input,
3624 *startinpos, *endinpos,
3625 reason);
3626 if (*exceptionObject == NULL)
3627 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003628
3629 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3630 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003631 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003632 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003633 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003634 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003635 }
3636 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003637 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003638
3639 /* Copy back the bytes variables, which might have been modified by the
3640 callback */
3641 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3642 if (!inputobj)
3643 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003644 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003645 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003646 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003647 *input = PyBytes_AS_STRING(inputobj);
3648 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003649 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003650 /* we can DECREF safely, as the exception has another reference,
3651 so the object won't go away. */
3652 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003653
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003654 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003655 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003656 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003657 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3658 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003659 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003660
3661 /* need more space? (at least enough for what we
3662 have+the replacement+the rest of the string (starting
3663 at the new input position), so we won't have to check space
3664 when there are no errors in the rest of the string) */
3665 repptr = PyUnicode_AS_UNICODE(repunicode);
3666 repsize = PyUnicode_GET_SIZE(repunicode);
3667 requiredsize = *outpos + repsize + insize-newpos;
3668 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003669 if (requiredsize<2*outsize)
3670 requiredsize = 2*outsize;
Victor Stinnerfe226c02011-10-03 03:52:20 +02003671 if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003672 goto onError;
3673 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003674 }
3675 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003676 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003677 Py_UNICODE_COPY(*outptr, repptr, repsize);
3678 *outptr += repsize;
3679 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003680
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003681 /* we made it! */
3682 res = 0;
3683
Benjamin Peterson29060642009-01-31 22:14:21 +00003684 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003685 Py_XDECREF(restuple);
3686 return res;
3687}
3688
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003689/* --- UTF-7 Codec -------------------------------------------------------- */
3690
Antoine Pitrou244651a2009-05-04 18:56:13 +00003691/* See RFC2152 for details. We encode conservatively and decode liberally. */
3692
3693/* Three simple macros defining base-64. */
3694
/* IS_BASE64(c): true (1) when c belongs to the base-64 alphabet used by
   UTF-7, i.e. A-Z, a-z, 0-9, '+' or '/'; false (0) otherwise.
   The argument is evaluated more than once, so it must not have side
   effects. */

#define IS_BASE64(c)                    \
    ((c) == '+' || (c) == '/' ||        \
     ('0' <= (c) && (c) <= '9') ||      \
     ('a' <= (c) && (c) <= 'z') ||      \
     ('A' <= (c) && (c) <= 'Z'))
3702
/* FROM_BASE64(c): the 6-bit value (0..63) of the base-64 character c.
   The caller must already know that c is a base-64 character: anything
   that is not a letter, a digit or '+' is mapped to 63, the value of '/'.
   The argument is evaluated more than once. */

#define FROM_BASE64(c)                                      \
    ((c) == '+' ? 62 :                                      \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :          \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :          \
     ('A' <= (c) && (c) <= 'Z') ? (c) - 'A' : 63)
3710
/* TO_BASE64(n): the base-64 character that encodes the low 6 bits of n;
   all higher bits of n are ignored. */

#define TO_BASE64(n)  \
    (*("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/" + ((n) & 63)))
3715
/* DECODE_DIRECT(c): true when a byte c found in UTF-7 input outside a
 * shift sequence decodes as itself.  Decoding is permissive: every
 * ASCII byte qualifies except '+', which opens a base-64 section.
 * The argument is evaluated more than once. */

#define DECODE_DIRECT(c)                \
    ((c) != '+' && (c) < 128)
3723
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array, indexed by the ASCII code
 * (0..127), identifies these different sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
3757
/* ENCODE_DIRECT(c, directO, directWS): true when the encoder may write
 * character c as itself instead of base-64 encoding it.  Only non-NUL
 * ASCII characters can qualify.  Set D characters always do; Set O
 * characters do when directO is true, and whitespace does when directWS
 * is true.  RFC2152 makes it clear that these two choices vary between
 * applications, which is why they are parameters.
 * The character argument is evaluated more than once. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) > 0 && (c) < 128 &&                            \
     (utf7_category[(c)] == 0 ||                        \
      (utf7_category[(c)] == 1 && (directO)) ||         \
      (utf7_category[(c)] == 2 && (directWS))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003769
Alexander Belopolsky40018472011-02-26 01:02:56 +00003770PyObject *
3771PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003772 Py_ssize_t size,
3773 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003774{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003775 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3776}
3777
Antoine Pitrou244651a2009-05-04 18:56:13 +00003778/* The decoder. The only state we preserve is our read position,
3779 * i.e. how many characters we have consumed. So if we end in the
3780 * middle of a shift sequence we have to back off the read position
3781 * and the output to the beginning of the sequence, otherwise we lose
3782 * all the shift state (seen bits, number of bits seen, high
3783 * surrogate). */
3784
Alexander Belopolsky40018472011-02-26 01:02:56 +00003785PyObject *
3786PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003787 Py_ssize_t size,
3788 const char *errors,
3789 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003790{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003791 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003792 Py_ssize_t startinpos;
3793 Py_ssize_t endinpos;
3794 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003795 const char *e;
3796 PyUnicodeObject *unicode;
3797 Py_UNICODE *p;
3798 const char *errmsg = "";
3799 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003800 Py_UNICODE *shiftOutStart;
3801 unsigned int base64bits = 0;
3802 unsigned long base64buffer = 0;
3803 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003804 PyObject *errorHandler = NULL;
3805 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003806
3807 unicode = _PyUnicode_New(size);
3808 if (!unicode)
3809 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003810 if (size == 0) {
3811 if (consumed)
3812 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003813 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003814 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003815
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003816 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003817 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003818 e = s + size;
3819
3820 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003821 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003822 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003823 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003824
Antoine Pitrou244651a2009-05-04 18:56:13 +00003825 if (inShift) { /* in a base-64 section */
3826 if (IS_BASE64(ch)) { /* consume a base-64 character */
3827 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3828 base64bits += 6;
3829 s++;
3830 if (base64bits >= 16) {
3831 /* we have enough bits for a UTF-16 value */
3832 Py_UNICODE outCh = (Py_UNICODE)
3833 (base64buffer >> (base64bits-16));
3834 base64bits -= 16;
3835 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3836 if (surrogate) {
3837 /* expecting a second surrogate */
3838 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3839#ifdef Py_UNICODE_WIDE
3840 *p++ = (((surrogate & 0x3FF)<<10)
3841 | (outCh & 0x3FF)) + 0x10000;
3842#else
3843 *p++ = surrogate;
3844 *p++ = outCh;
3845#endif
3846 surrogate = 0;
3847 }
3848 else {
3849 surrogate = 0;
3850 errmsg = "second surrogate missing";
3851 goto utf7Error;
3852 }
3853 }
3854 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3855 /* first surrogate */
3856 surrogate = outCh;
3857 }
3858 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3859 errmsg = "unexpected second surrogate";
3860 goto utf7Error;
3861 }
3862 else {
3863 *p++ = outCh;
3864 }
3865 }
3866 }
3867 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003868 inShift = 0;
3869 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003870 if (surrogate) {
3871 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003872 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003873 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003874 if (base64bits > 0) { /* left-over bits */
3875 if (base64bits >= 6) {
3876 /* We've seen at least one base-64 character */
3877 errmsg = "partial character in shift sequence";
3878 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003879 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003880 else {
3881 /* Some bits remain; they should be zero */
3882 if (base64buffer != 0) {
3883 errmsg = "non-zero padding bits in shift sequence";
3884 goto utf7Error;
3885 }
3886 }
3887 }
3888 if (ch != '-') {
3889 /* '-' is absorbed; other terminating
3890 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003891 *p++ = ch;
3892 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003893 }
3894 }
3895 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003896 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003897 s++; /* consume '+' */
3898 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003899 s++;
3900 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003901 }
3902 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003903 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003904 shiftOutStart = p;
3905 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003906 }
3907 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003908 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003909 *p++ = ch;
3910 s++;
3911 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003912 else {
3913 startinpos = s-starts;
3914 s++;
3915 errmsg = "unexpected special character";
3916 goto utf7Error;
3917 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003918 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003919utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003920 outpos = p-PyUnicode_AS_UNICODE(unicode);
3921 endinpos = s-starts;
3922 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003923 errors, &errorHandler,
3924 "utf7", errmsg,
3925 &starts, &e, &startinpos, &endinpos, &exc, &s,
3926 &unicode, &outpos, &p))
3927 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003928 }
3929
Antoine Pitrou244651a2009-05-04 18:56:13 +00003930 /* end of string */
3931
3932 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3933 /* if we're in an inconsistent state, that's an error */
3934 if (surrogate ||
3935 (base64bits >= 6) ||
3936 (base64bits > 0 && base64buffer != 0)) {
3937 outpos = p-PyUnicode_AS_UNICODE(unicode);
3938 endinpos = size;
3939 if (unicode_decode_call_errorhandler(
3940 errors, &errorHandler,
3941 "utf7", "unterminated shift sequence",
3942 &starts, &e, &startinpos, &endinpos, &exc, &s,
3943 &unicode, &outpos, &p))
3944 goto onError;
3945 if (s < e)
3946 goto restart;
3947 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003948 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003949
3950 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003951 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003952 if (inShift) {
3953 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003954 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003955 }
3956 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003957 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003958 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003959 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003960
Victor Stinnerfe226c02011-10-03 03:52:20 +02003961 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003962 goto onError;
3963
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003964 Py_XDECREF(errorHandler);
3965 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02003966#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02003967 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003968 Py_DECREF(unicode);
3969 return NULL;
3970 }
Victor Stinner17efeed2011-10-04 20:05:46 +02003971#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02003972 assert(_PyUnicode_CheckConsistency(unicode, 1));
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003973 return (PyObject *)unicode;
3974
Benjamin Peterson29060642009-01-31 22:14:21 +00003975 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003976 Py_XDECREF(errorHandler);
3977 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003978 Py_DECREF(unicode);
3979 return NULL;
3980}
3981
3982
Alexander Belopolsky40018472011-02-26 01:02:56 +00003983PyObject *
3984PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003985 Py_ssize_t size,
3986 int base64SetO,
3987 int base64WhiteSpace,
3988 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003989{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003990 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003991 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003992 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003993 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003994 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003995 unsigned int base64bits = 0;
3996 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003997 char * out;
3998 char * start;
3999
4000 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004001 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004002
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00004003 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004004 return PyErr_NoMemory();
4005
Antoine Pitrou244651a2009-05-04 18:56:13 +00004006 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004007 if (v == NULL)
4008 return NULL;
4009
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004010 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004011 for (;i < size; ++i) {
4012 Py_UNICODE ch = s[i];
4013
Antoine Pitrou244651a2009-05-04 18:56:13 +00004014 if (inShift) {
4015 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4016 /* shifting out */
4017 if (base64bits) { /* output remaining bits */
4018 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4019 base64buffer = 0;
4020 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004021 }
4022 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004023 /* Characters not in the BASE64 set implicitly unshift the sequence
4024 so no '-' is required, except if the character is itself a '-' */
4025 if (IS_BASE64(ch) || ch == '-') {
4026 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004027 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004028 *out++ = (char) ch;
4029 }
4030 else {
4031 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004032 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004033 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004034 else { /* not in a shift sequence */
4035 if (ch == '+') {
4036 *out++ = '+';
4037 *out++ = '-';
4038 }
4039 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4040 *out++ = (char) ch;
4041 }
4042 else {
4043 *out++ = '+';
4044 inShift = 1;
4045 goto encode_char;
4046 }
4047 }
4048 continue;
4049encode_char:
4050#ifdef Py_UNICODE_WIDE
4051 if (ch >= 0x10000) {
4052 /* code first surrogate */
4053 base64bits += 16;
4054 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4055 while (base64bits >= 6) {
4056 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4057 base64bits -= 6;
4058 }
4059 /* prepare second surrogate */
4060 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
4061 }
4062#endif
4063 base64bits += 16;
4064 base64buffer = (base64buffer << 16) | ch;
4065 while (base64bits >= 6) {
4066 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4067 base64bits -= 6;
4068 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004069 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004070 if (base64bits)
4071 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4072 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004073 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004074 if (_PyBytes_Resize(&v, out - start) < 0)
4075 return NULL;
4076 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004077}
4078
Antoine Pitrou244651a2009-05-04 18:56:13 +00004079#undef IS_BASE64
4080#undef FROM_BASE64
4081#undef TO_BASE64
4082#undef DECODE_DIRECT
4083#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004084
Guido van Rossumd57fd912000-03-10 22:53:23 +00004085/* --- UTF-8 Codec -------------------------------------------------------- */
4086
/* Map a UTF-8 encoded prefix (first) byte to the length in bytes of the
   sequence it starts.  Zero means illegal prefix.  See RFC 3629 for
   details.  The table is only ever read, so it is declared const. */
static
const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
4108
Alexander Belopolsky40018472011-02-26 01:02:56 +00004109PyObject *
4110PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004111 Py_ssize_t size,
4112 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004113{
Walter Dörwald69652032004-09-07 20:24:22 +00004114 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4115}
4116
/* Bit mask selecting the low-order address bits that must all be zero
   for a pointer to be aligned on a C 'long' boundary.  AND it with a
   pointer value to test alignment, or AND with its complement to round
   the pointer down to such a boundary. */
#define LONG_PTR_MASK ((size_t) (SIZEOF_LONG - 1))
4119
4120/* Mask to quickly check whether a C 'long' contains a
4121 non-ASCII, UTF8-encoded char. */
4122#if (SIZEOF_LONG == 8)
4123# define ASCII_CHAR_MASK 0x8080808080808080L
4124#elif (SIZEOF_LONG == 4)
4125# define ASCII_CHAR_MASK 0x80808080L
4126#else
4127# error C 'long' size should be either 4 or 8!
4128#endif
4129
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004130/* Scans a UTF-8 string and returns the maximum character to be expected,
4131 the size of the decoded unicode string and if any major errors were
4132 encountered.
4133
4134 This function does check basic UTF-8 sanity, it does however NOT CHECK
4135 if the string contains surrogates, and if all continuation bytes are
4136 within the correct ranges, these checks are performed in
4137 PyUnicode_DecodeUTF8Stateful.
4138
4139 If it sets has_errors to 1, it means the value of unicode_size and max_char
4140 will be bogus and you should not rely on useful information in them.
4141 */
4142static Py_UCS4
4143utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
4144 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
4145 int *has_errors)
4146{
4147 Py_ssize_t n;
4148 Py_ssize_t char_count = 0;
4149 Py_UCS4 max_char = 127, new_max;
4150 Py_UCS4 upper_bound;
4151 const unsigned char *p = (const unsigned char *)s;
4152 const unsigned char *end = p + string_size;
4153 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
4154 int err = 0;
4155
4156 for (; p < end && !err; ++p, ++char_count) {
4157 /* Only check value if it's not a ASCII char... */
4158 if (*p < 0x80) {
4159 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4160 an explanation. */
4161 if (!((size_t) p & LONG_PTR_MASK)) {
4162 /* Help register allocation */
4163 register const unsigned char *_p = p;
4164 while (_p < aligned_end) {
4165 unsigned long value = *(unsigned long *) _p;
4166 if (value & ASCII_CHAR_MASK)
4167 break;
4168 _p += SIZEOF_LONG;
4169 char_count += SIZEOF_LONG;
4170 }
4171 p = _p;
4172 if (p == end)
4173 break;
4174 }
4175 }
4176 if (*p >= 0x80) {
4177 n = utf8_code_length[*p];
4178 new_max = max_char;
4179 switch (n) {
4180 /* invalid start byte */
4181 case 0:
4182 err = 1;
4183 break;
4184 case 2:
4185 /* Code points between 0x00FF and 0x07FF inclusive.
4186 Approximate the upper bound of the code point,
4187 if this flips over 255 we can be sure it will be more
4188 than 255 and the string will need 2 bytes per code coint,
4189 if it stays under or equal to 255, we can be sure 1 byte
4190 is enough.
4191 ((*p & 0b00011111) << 6) | 0b00111111 */
4192 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
4193 if (max_char < upper_bound)
4194 new_max = upper_bound;
4195 /* Ensure we track at least that we left ASCII space. */
4196 if (new_max < 128)
4197 new_max = 128;
4198 break;
4199 case 3:
4200 /* Between 0x0FFF and 0xFFFF inclusive, so values are
4201 always > 255 and <= 65535 and will always need 2 bytes. */
4202 if (max_char < 65535)
4203 new_max = 65535;
4204 break;
4205 case 4:
4206 /* Code point will be above 0xFFFF for sure in this case. */
4207 new_max = 65537;
4208 break;
4209 /* Internal error, this should be caught by the first if */
4210 case 1:
4211 default:
4212 assert(0 && "Impossible case in utf8_max_char_and_size");
4213 err = 1;
4214 }
4215 /* Instead of number of overall bytes for this code point,
Georg Brandl7597add2011-10-05 16:36:47 +02004216 n contains the number of following bytes: */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004217 --n;
4218 /* Check if the follow up chars are all valid continuation bytes */
4219 if (n >= 1) {
4220 const unsigned char *cont;
4221 if ((p + n) >= end) {
4222 if (consumed == 0)
4223 /* incomplete data, non-incremental decoding */
4224 err = 1;
4225 break;
4226 }
4227 for (cont = p + 1; cont < (p + n); ++cont) {
4228 if ((*cont & 0xc0) != 0x80) {
4229 err = 1;
4230 break;
4231 }
4232 }
4233 p += n;
4234 }
4235 else
4236 err = 1;
4237 max_char = new_max;
4238 }
4239 }
4240
4241 if (unicode_size)
4242 *unicode_size = char_count;
4243 if (has_errors)
4244 *has_errors = err;
4245 return max_char;
4246}
4247
/* Store `value` at position `index` of the character buffer `buf`,
   converting it to the element type selected by `kind`.  Works like
   PyUnicode_WRITE but additionally accepts PyUnicode_WCHAR_KIND, in
   which case `buf` is the Py_UNICODE (wstr) buffer of the legacy unicode
   representation.  Any kind other than WCHAR, 1BYTE or 2BYTE is written
   as Py_UCS4. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value) \
    do { \
        switch ((int)(kind)) { \
        case PyUnicode_WCHAR_KIND: \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
            break; \
        case PyUnicode_1BYTE_KIND: \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
            break; \
        case PyUnicode_2BYTE_KIND: \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value); \
            break; \
        default: \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value); \
            break; \
        } \
    } while (0)
4262
Alexander Belopolsky40018472011-02-26 01:02:56 +00004263PyObject *
4264PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004265 Py_ssize_t size,
4266 const char *errors,
4267 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004268{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004269 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004270 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004271 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004272 Py_ssize_t startinpos;
4273 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00004274 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004275 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004276 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004277 PyObject *errorHandler = NULL;
4278 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004279 Py_UCS4 maxchar = 0;
4280 Py_ssize_t unicode_size;
4281 Py_ssize_t i;
4282 int kind;
4283 void *data;
4284 int has_errors;
4285 Py_UNICODE *error_outptr;
4286#if SIZEOF_WCHAR_T == 2
4287 Py_ssize_t wchar_offset = 0;
4288#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004289
Walter Dörwald69652032004-09-07 20:24:22 +00004290 if (size == 0) {
4291 if (consumed)
4292 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004293 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00004294 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004295 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
4296 consumed, &has_errors);
4297 if (has_errors) {
4298 unicode = _PyUnicode_New(size);
4299 if (!unicode)
4300 return NULL;
4301 kind = PyUnicode_WCHAR_KIND;
4302 data = PyUnicode_AS_UNICODE(unicode);
4303 assert(data != NULL);
4304 }
4305 else {
4306 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
4307 if (!unicode)
4308 return NULL;
4309 /* When the string is ASCII only, just use memcpy and return.
4310 unicode_size may be != size if there is an incomplete UTF-8
4311 sequence at the end of the ASCII block. */
4312 if (maxchar < 128 && size == unicode_size) {
4313 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
4314 return (PyObject *)unicode;
4315 }
4316 kind = PyUnicode_KIND(unicode);
4317 data = PyUnicode_DATA(unicode);
4318 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004319 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004320 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004321 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00004322 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004323
4324 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004325 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004326
4327 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004328 /* Fast path for runs of ASCII characters. Given that common UTF-8
4329 input will consist of an overwhelming majority of ASCII
4330 characters, we try to optimize for this case by checking
4331 as many characters as a C 'long' can contain.
4332 First, check if we can do an aligned read, as most CPUs have
4333 a penalty for unaligned reads.
4334 */
4335 if (!((size_t) s & LONG_PTR_MASK)) {
4336 /* Help register allocation */
4337 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004338 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004339 while (_s < aligned_end) {
4340 /* Read a whole long at a time (either 4 or 8 bytes),
4341 and do a fast unrolled copy if it only contains ASCII
4342 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004343 unsigned long value = *(unsigned long *) _s;
4344 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004345 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004346 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
4347 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
4348 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
4349 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004350#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004351 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
4352 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
4353 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
4354 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004355#endif
4356 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004357 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004358 }
4359 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004360 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004361 if (s == e)
4362 break;
4363 ch = (unsigned char)*s;
4364 }
4365 }
4366
4367 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004368 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004369 s++;
4370 continue;
4371 }
4372
4373 n = utf8_code_length[ch];
4374
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004375 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004376 if (consumed)
4377 break;
4378 else {
4379 errmsg = "unexpected end of data";
4380 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004381 endinpos = startinpos+1;
4382 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4383 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004384 goto utf8Error;
4385 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004386 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004387
4388 switch (n) {
4389
4390 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004391 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004392 startinpos = s-starts;
4393 endinpos = startinpos+1;
4394 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004395
4396 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004397 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004398 startinpos = s-starts;
4399 endinpos = startinpos+1;
4400 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004401
4402 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004403 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004404 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004405 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004406 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004407 goto utf8Error;
4408 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004409 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004410 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004411 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004412 break;
4413
4414 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004415 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4416 will result in surrogates in range d800-dfff. Surrogates are
4417 not valid UTF-8 so they are rejected.
4418 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4419 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004420 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004421 (s[2] & 0xc0) != 0x80 ||
4422 ((unsigned char)s[0] == 0xE0 &&
4423 (unsigned char)s[1] < 0xA0) ||
4424 ((unsigned char)s[0] == 0xED &&
4425 (unsigned char)s[1] > 0x9F)) {
4426 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004427 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004428 endinpos = startinpos + 1;
4429
4430 /* if s[1] first two bits are 1 and 0, then the invalid
4431 continuation byte is s[2], so increment endinpos by 1,
4432 if not, s[1] is invalid and endinpos doesn't need to
4433 be incremented. */
4434 if ((s[1] & 0xC0) == 0x80)
4435 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004436 goto utf8Error;
4437 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004438 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004439 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004440 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004441 break;
4442
4443 case 4:
4444 if ((s[1] & 0xc0) != 0x80 ||
4445 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004446 (s[3] & 0xc0) != 0x80 ||
4447 ((unsigned char)s[0] == 0xF0 &&
4448 (unsigned char)s[1] < 0x90) ||
4449 ((unsigned char)s[0] == 0xF4 &&
4450 (unsigned char)s[1] > 0x8F)) {
4451 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004452 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004453 endinpos = startinpos + 1;
4454 if ((s[1] & 0xC0) == 0x80) {
4455 endinpos++;
4456 if ((s[2] & 0xC0) == 0x80)
4457 endinpos++;
4458 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004459 goto utf8Error;
4460 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004461 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004462 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4463 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4464
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004465 /* If the string is flexible or we have native UCS-4, write
4466 directly.. */
4467 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
4468 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00004469
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004470 else {
4471 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00004472
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004473 /* translate from 10000..10FFFF to 0..FFFF */
4474 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00004475
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004476 /* high surrogate = top 10 bits added to D800 */
4477 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4478 (Py_UNICODE)(0xD800 + (ch >> 10)));
4479
4480 /* low surrogate = bottom 10 bits added to DC00 */
4481 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4482 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
4483 }
4484#if SIZEOF_WCHAR_T == 2
4485 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004486#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004487 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004488 }
4489 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004490 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004491
Benjamin Peterson29060642009-01-31 22:14:21 +00004492 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004493 /* If this is not yet a resizable string, make it one.. */
4494 if (kind != PyUnicode_WCHAR_KIND) {
4495 const Py_UNICODE *u;
4496 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
4497 if (!new_unicode)
4498 goto onError;
4499 u = PyUnicode_AsUnicode((PyObject *)unicode);
4500 if (!u)
4501 goto onError;
4502#if SIZEOF_WCHAR_T == 2
4503 i += wchar_offset;
4504#endif
4505 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
4506 Py_DECREF(unicode);
4507 unicode = new_unicode;
4508 kind = 0;
4509 data = PyUnicode_AS_UNICODE(new_unicode);
4510 assert(data != NULL);
4511 }
4512 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00004513 if (unicode_decode_call_errorhandler(
4514 errors, &errorHandler,
4515 "utf8", errmsg,
4516 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004517 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00004518 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004519 /* Update data because unicode_decode_call_errorhandler might have
4520 re-created or resized the unicode object. */
4521 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004522 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004523 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004524 /* Ensure the unicode_size calculation above was correct: */
4525 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
4526
Walter Dörwald69652032004-09-07 20:24:22 +00004527 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004528 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004529
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004530 /* Adjust length and ready string when it contained errors and
4531 is of the old resizable kind. */
4532 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004533 if (PyUnicode_Resize((PyObject**)&unicode, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004534 goto onError;
4535 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004536
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004537 Py_XDECREF(errorHandler);
4538 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02004539#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004540 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004541 Py_DECREF(unicode);
4542 return NULL;
4543 }
Victor Stinner17efeed2011-10-04 20:05:46 +02004544#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004545 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004546 return (PyObject *)unicode;
4547
Benjamin Peterson29060642009-01-31 22:14:21 +00004548 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004549 Py_XDECREF(errorHandler);
4550 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004551 Py_DECREF(unicode);
4552 return NULL;
4553}
4554
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004555#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00004556
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004557#ifdef __APPLE__
4558
4559/* Simplified UTF-8 decoder using surrogateescape error handler,
4560 used to decode the command line arguments on Mac OS X. */
4561
4562wchar_t*
4563_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4564{
4565 int n;
4566 const char *e;
4567 wchar_t *unicode, *p;
4568
4569 /* Note: size will always be longer than the resulting Unicode
4570 character count */
4571 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4572 PyErr_NoMemory();
4573 return NULL;
4574 }
4575 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4576 if (!unicode)
4577 return NULL;
4578
4579 /* Unpack UTF-8 encoded data */
4580 p = unicode;
4581 e = s + size;
4582 while (s < e) {
4583 Py_UCS4 ch = (unsigned char)*s;
4584
4585 if (ch < 0x80) {
4586 *p++ = (wchar_t)ch;
4587 s++;
4588 continue;
4589 }
4590
4591 n = utf8_code_length[ch];
4592 if (s + n > e) {
4593 goto surrogateescape;
4594 }
4595
4596 switch (n) {
4597 case 0:
4598 case 1:
4599 goto surrogateescape;
4600
4601 case 2:
4602 if ((s[1] & 0xc0) != 0x80)
4603 goto surrogateescape;
4604 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4605 assert ((ch > 0x007F) && (ch <= 0x07FF));
4606 *p++ = (wchar_t)ch;
4607 break;
4608
4609 case 3:
4610 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4611 will result in surrogates in range d800-dfff. Surrogates are
4612 not valid UTF-8 so they are rejected.
4613 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4614 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4615 if ((s[1] & 0xc0) != 0x80 ||
4616 (s[2] & 0xc0) != 0x80 ||
4617 ((unsigned char)s[0] == 0xE0 &&
4618 (unsigned char)s[1] < 0xA0) ||
4619 ((unsigned char)s[0] == 0xED &&
4620 (unsigned char)s[1] > 0x9F)) {
4621
4622 goto surrogateescape;
4623 }
4624 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4625 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004626 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004627 break;
4628
4629 case 4:
4630 if ((s[1] & 0xc0) != 0x80 ||
4631 (s[2] & 0xc0) != 0x80 ||
4632 (s[3] & 0xc0) != 0x80 ||
4633 ((unsigned char)s[0] == 0xF0 &&
4634 (unsigned char)s[1] < 0x90) ||
4635 ((unsigned char)s[0] == 0xF4 &&
4636 (unsigned char)s[1] > 0x8F)) {
4637 goto surrogateescape;
4638 }
4639 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4640 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4641 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4642
4643#if SIZEOF_WCHAR_T == 4
4644 *p++ = (wchar_t)ch;
4645#else
4646 /* compute and append the two surrogates: */
4647
4648 /* translate from 10000..10FFFF to 0..FFFF */
4649 ch -= 0x10000;
4650
4651 /* high surrogate = top 10 bits added to D800 */
4652 *p++ = (wchar_t)(0xD800 + (ch >> 10));
4653
4654 /* low surrogate = bottom 10 bits added to DC00 */
4655 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
4656#endif
4657 break;
4658 }
4659 s += n;
4660 continue;
4661
4662 surrogateescape:
4663 *p++ = 0xDC00 + ch;
4664 s++;
4665 }
4666 *p = L'\0';
4667 return unicode;
4668}
4669
4670#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004671
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004672/* Primary internal function which creates utf8 encoded bytes objects.
4673
4674 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004675 and allocate exactly as much space needed at the end. Else allocate the
4676 maximum possible needed (4 result bytes per Unicode character), and return
4677 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004678*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004679PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004680_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004681{
Tim Peters602f7402002-04-27 18:03:26 +00004682#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004683
Guido van Rossum98297ee2007-11-06 21:34:58 +00004684 Py_ssize_t i; /* index into s of next input byte */
4685 PyObject *result; /* result string object */
4686 char *p; /* next free byte in output buffer */
4687 Py_ssize_t nallocated; /* number of result bytes allocated */
4688 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004689 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004690 PyObject *errorHandler = NULL;
4691 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004692 int kind;
4693 void *data;
4694 Py_ssize_t size;
4695 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4696#if SIZEOF_WCHAR_T == 2
4697 Py_ssize_t wchar_offset = 0;
4698#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004699
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004700 if (!PyUnicode_Check(unicode)) {
4701 PyErr_BadArgument();
4702 return NULL;
4703 }
4704
4705 if (PyUnicode_READY(unicode) == -1)
4706 return NULL;
4707
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004708 if (PyUnicode_UTF8(unicode))
4709 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4710 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004711
4712 kind = PyUnicode_KIND(unicode);
4713 data = PyUnicode_DATA(unicode);
4714 size = PyUnicode_GET_LENGTH(unicode);
4715
Tim Peters602f7402002-04-27 18:03:26 +00004716 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004717
Tim Peters602f7402002-04-27 18:03:26 +00004718 if (size <= MAX_SHORT_UNICHARS) {
4719 /* Write into the stack buffer; nallocated can't overflow.
4720 * At the end, we'll allocate exactly as much heap space as it
4721 * turns out we need.
4722 */
4723 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004724 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004725 p = stackbuf;
4726 }
4727 else {
4728 /* Overallocate on the heap, and give the excess back at the end. */
4729 nallocated = size * 4;
4730 if (nallocated / 4 != size) /* overflow! */
4731 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004732 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004733 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004734 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004735 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004736 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004737
Tim Peters602f7402002-04-27 18:03:26 +00004738 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004739 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004740
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004741 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004742 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004743 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004744
Guido van Rossumd57fd912000-03-10 22:53:23 +00004745 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004746 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004747 *p++ = (char)(0xc0 | (ch >> 6));
4748 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004749 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004750 Py_ssize_t newpos;
4751 PyObject *rep;
4752 Py_ssize_t repsize, k, startpos;
4753 startpos = i-1;
4754#if SIZEOF_WCHAR_T == 2
4755 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004756#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004757 rep = unicode_encode_call_errorhandler(
4758 errors, &errorHandler, "utf-8", "surrogates not allowed",
4759 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4760 &exc, startpos, startpos+1, &newpos);
4761 if (!rep)
4762 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004763
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004764 if (PyBytes_Check(rep))
4765 repsize = PyBytes_GET_SIZE(rep);
4766 else
4767 repsize = PyUnicode_GET_SIZE(rep);
4768
4769 if (repsize > 4) {
4770 Py_ssize_t offset;
4771
4772 if (result == NULL)
4773 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004774 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004775 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004776
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004777 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4778 /* integer overflow */
4779 PyErr_NoMemory();
4780 goto error;
4781 }
4782 nallocated += repsize - 4;
4783 if (result != NULL) {
4784 if (_PyBytes_Resize(&result, nallocated) < 0)
4785 goto error;
4786 } else {
4787 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004788 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004789 goto error;
4790 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4791 }
4792 p = PyBytes_AS_STRING(result) + offset;
4793 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004794
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004795 if (PyBytes_Check(rep)) {
4796 char *prep = PyBytes_AS_STRING(rep);
4797 for(k = repsize; k > 0; k--)
4798 *p++ = *prep++;
4799 } else /* rep is unicode */ {
4800 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4801 Py_UNICODE c;
4802
4803 for(k=0; k<repsize; k++) {
4804 c = prep[k];
4805 if (0x80 <= c) {
4806 raise_encode_exception(&exc, "utf-8",
4807 PyUnicode_AS_UNICODE(unicode),
4808 size, i-1, i,
4809 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004810 goto error;
4811 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004812 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004813 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004814 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004815 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004816 } else if (ch < 0x10000) {
4817 *p++ = (char)(0xe0 | (ch >> 12));
4818 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4819 *p++ = (char)(0x80 | (ch & 0x3f));
4820 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004821 /* Encode UCS4 Unicode ordinals */
4822 *p++ = (char)(0xf0 | (ch >> 18));
4823 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4824 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4825 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004826#if SIZEOF_WCHAR_T == 2
4827 wchar_offset++;
4828#endif
Tim Peters602f7402002-04-27 18:03:26 +00004829 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004830 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004831
Guido van Rossum98297ee2007-11-06 21:34:58 +00004832 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004833 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004834 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004835 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004836 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004837 }
4838 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004839 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004840 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004841 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004842 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004843 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004844
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004845 Py_XDECREF(errorHandler);
4846 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004847 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004848 error:
4849 Py_XDECREF(errorHandler);
4850 Py_XDECREF(exc);
4851 Py_XDECREF(result);
4852 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004853
Tim Peters602f7402002-04-27 18:03:26 +00004854#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004855}
4856
Alexander Belopolsky40018472011-02-26 01:02:56 +00004857PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004858PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4859 Py_ssize_t size,
4860 const char *errors)
4861{
4862 PyObject *v, *unicode;
4863
4864 unicode = PyUnicode_FromUnicode(s, size);
4865 if (unicode == NULL)
4866 return NULL;
4867 v = _PyUnicode_AsUTF8String(unicode, errors);
4868 Py_DECREF(unicode);
4869 return v;
4870}
4871
4872PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004873PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004874{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004875 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004876}
4877
Walter Dörwald41980ca2007-08-16 21:55:45 +00004878/* --- UTF-32 Codec ------------------------------------------------------- */
4879
4880PyObject *
4881PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004882 Py_ssize_t size,
4883 const char *errors,
4884 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004885{
4886 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4887}
4888
4889PyObject *
4890PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004891 Py_ssize_t size,
4892 const char *errors,
4893 int *byteorder,
4894 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004895{
4896 const char *starts = s;
4897 Py_ssize_t startinpos;
4898 Py_ssize_t endinpos;
4899 Py_ssize_t outpos;
4900 PyUnicodeObject *unicode;
4901 Py_UNICODE *p;
4902#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004903 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004904 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004905#else
4906 const int pairs = 0;
4907#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004908 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004909 int bo = 0; /* assume native ordering by default */
4910 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004911 /* Offsets from q for retrieving bytes in the right order. */
4912#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4913 int iorder[] = {0, 1, 2, 3};
4914#else
4915 int iorder[] = {3, 2, 1, 0};
4916#endif
4917 PyObject *errorHandler = NULL;
4918 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004919
Walter Dörwald41980ca2007-08-16 21:55:45 +00004920 q = (unsigned char *)s;
4921 e = q + size;
4922
4923 if (byteorder)
4924 bo = *byteorder;
4925
4926 /* Check for BOM marks (U+FEFF) in the input and adjust current
4927 byte order setting accordingly. In native mode, the leading BOM
4928 mark is skipped, in all other modes, it is copied to the output
4929 stream as-is (giving a ZWNBSP character). */
4930 if (bo == 0) {
4931 if (size >= 4) {
4932 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004933 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004934#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004935 if (bom == 0x0000FEFF) {
4936 q += 4;
4937 bo = -1;
4938 }
4939 else if (bom == 0xFFFE0000) {
4940 q += 4;
4941 bo = 1;
4942 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004943#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004944 if (bom == 0x0000FEFF) {
4945 q += 4;
4946 bo = 1;
4947 }
4948 else if (bom == 0xFFFE0000) {
4949 q += 4;
4950 bo = -1;
4951 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004952#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004953 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004954 }
4955
4956 if (bo == -1) {
4957 /* force LE */
4958 iorder[0] = 0;
4959 iorder[1] = 1;
4960 iorder[2] = 2;
4961 iorder[3] = 3;
4962 }
4963 else if (bo == 1) {
4964 /* force BE */
4965 iorder[0] = 3;
4966 iorder[1] = 2;
4967 iorder[2] = 1;
4968 iorder[3] = 0;
4969 }
4970
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004971 /* On narrow builds we split characters outside the BMP into two
4972 codepoints => count how much extra space we need. */
4973#ifndef Py_UNICODE_WIDE
4974 for (qq = q; qq < e; qq += 4)
4975 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4976 pairs++;
4977#endif
4978
4979 /* This might be one to much, because of a BOM */
4980 unicode = _PyUnicode_New((size+3)/4+pairs);
4981 if (!unicode)
4982 return NULL;
4983 if (size == 0)
4984 return (PyObject *)unicode;
4985
4986 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004987 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004988
Walter Dörwald41980ca2007-08-16 21:55:45 +00004989 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004990 Py_UCS4 ch;
4991 /* remaining bytes at the end? (size should be divisible by 4) */
4992 if (e-q<4) {
4993 if (consumed)
4994 break;
4995 errmsg = "truncated data";
4996 startinpos = ((const char *)q)-starts;
4997 endinpos = ((const char *)e)-starts;
4998 goto utf32Error;
4999 /* The remaining input chars are ignored if the callback
5000 chooses to skip the input */
5001 }
5002 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5003 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005004
Benjamin Peterson29060642009-01-31 22:14:21 +00005005 if (ch >= 0x110000)
5006 {
5007 errmsg = "codepoint not in range(0x110000)";
5008 startinpos = ((const char *)q)-starts;
5009 endinpos = startinpos+4;
5010 goto utf32Error;
5011 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005012#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005013 if (ch >= 0x10000)
5014 {
5015 *p++ = 0xD800 | ((ch-0x10000) >> 10);
5016 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
5017 }
5018 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00005019#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005020 *p++ = ch;
5021 q += 4;
5022 continue;
5023 utf32Error:
5024 outpos = p-PyUnicode_AS_UNICODE(unicode);
5025 if (unicode_decode_call_errorhandler(
5026 errors, &errorHandler,
5027 "utf32", errmsg,
5028 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
5029 &unicode, &outpos, &p))
5030 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005031 }
5032
5033 if (byteorder)
5034 *byteorder = bo;
5035
5036 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005037 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005038
5039 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02005040 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005041 goto onError;
5042
5043 Py_XDECREF(errorHandler);
5044 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005045#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005046 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005047 Py_DECREF(unicode);
5048 return NULL;
5049 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005050#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005051 assert(_PyUnicode_CheckConsistency(unicode, 1));
Walter Dörwald41980ca2007-08-16 21:55:45 +00005052 return (PyObject *)unicode;
5053
Benjamin Peterson29060642009-01-31 22:14:21 +00005054 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005055 Py_DECREF(unicode);
5056 Py_XDECREF(errorHandler);
5057 Py_XDECREF(exc);
5058 return NULL;
5059}
5060
5061PyObject *
5062PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005063 Py_ssize_t size,
5064 const char *errors,
5065 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005066{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005067 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005068 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005069 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005070#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005071 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005072#else
5073 const int pairs = 0;
5074#endif
5075 /* Offsets from p for storing byte pairs in the right order. */
5076#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5077 int iorder[] = {0, 1, 2, 3};
5078#else
5079 int iorder[] = {3, 2, 1, 0};
5080#endif
5081
Benjamin Peterson29060642009-01-31 22:14:21 +00005082#define STORECHAR(CH) \
5083 do { \
5084 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5085 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5086 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5087 p[iorder[0]] = (CH) & 0xff; \
5088 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005089 } while(0)
5090
5091 /* In narrow builds we can output surrogate pairs as one codepoint,
5092 so we need less space. */
5093#ifndef Py_UNICODE_WIDE
5094 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005095 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
5096 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
5097 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005098#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005099 nsize = (size - pairs + (byteorder == 0));
5100 bytesize = nsize * 4;
5101 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005102 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005103 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005104 if (v == NULL)
5105 return NULL;
5106
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005107 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005108 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005109 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005110 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005111 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005112
5113 if (byteorder == -1) {
5114 /* force LE */
5115 iorder[0] = 0;
5116 iorder[1] = 1;
5117 iorder[2] = 2;
5118 iorder[3] = 3;
5119 }
5120 else if (byteorder == 1) {
5121 /* force BE */
5122 iorder[0] = 3;
5123 iorder[1] = 2;
5124 iorder[2] = 1;
5125 iorder[3] = 0;
5126 }
5127
5128 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005129 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005130#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005131 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
5132 Py_UCS4 ch2 = *s;
5133 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
5134 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
5135 s++;
5136 size--;
5137 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005138 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005139#endif
5140 STORECHAR(ch);
5141 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005142
5143 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005144 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005145#undef STORECHAR
5146}
5147
Alexander Belopolsky40018472011-02-26 01:02:56 +00005148PyObject *
5149PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005150{
5151 if (!PyUnicode_Check(unicode)) {
5152 PyErr_BadArgument();
5153 return NULL;
5154 }
5155 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005156 PyUnicode_GET_SIZE(unicode),
5157 NULL,
5158 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005159}
5160
Guido van Rossumd57fd912000-03-10 22:53:23 +00005161/* --- UTF-16 Codec ------------------------------------------------------- */
5162
Tim Peters772747b2001-08-09 22:21:55 +00005163PyObject *
5164PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005165 Py_ssize_t size,
5166 const char *errors,
5167 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005168{
Walter Dörwald69652032004-09-07 20:24:22 +00005169 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5170}
5171
Antoine Pitrouab868312009-01-10 15:40:25 +00005172/* Two masks for fast checking of whether a C 'long' may contain
5173 UTF16-encoded surrogate characters. This is an efficient heuristic,
5174 assuming that non-surrogate characters with a code point >= 0x8000 are
5175 rare in most input.
5176 FAST_CHAR_MASK is used when the input is in native byte ordering,
5177 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00005178*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005179#if (SIZEOF_LONG == 8)
5180# define FAST_CHAR_MASK 0x8000800080008000L
5181# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
5182#elif (SIZEOF_LONG == 4)
5183# define FAST_CHAR_MASK 0x80008000L
5184# define SWAPPED_FAST_CHAR_MASK 0x00800080L
5185#else
5186# error C 'long' size should be either 4 or 8!
5187#endif
5188
Walter Dörwald69652032004-09-07 20:24:22 +00005189PyObject *
5190PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005191 Py_ssize_t size,
5192 const char *errors,
5193 int *byteorder,
5194 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005195{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005196 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005197 Py_ssize_t startinpos;
5198 Py_ssize_t endinpos;
5199 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005200 PyUnicodeObject *unicode;
5201 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00005202 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005203 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005204 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005205 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005206 /* Offsets from q for retrieving byte pairs in the right order. */
5207#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5208 int ihi = 1, ilo = 0;
5209#else
5210 int ihi = 0, ilo = 1;
5211#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005212 PyObject *errorHandler = NULL;
5213 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005214
5215 /* Note: size will always be longer than the resulting Unicode
5216 character count */
5217 unicode = _PyUnicode_New(size);
5218 if (!unicode)
5219 return NULL;
5220 if (size == 0)
5221 return (PyObject *)unicode;
5222
5223 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005224 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00005225 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005226 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005227
5228 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005229 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005230
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005231 /* Check for BOM marks (U+FEFF) in the input and adjust current
5232 byte order setting accordingly. In native mode, the leading BOM
5233 mark is skipped, in all other modes, it is copied to the output
5234 stream as-is (giving a ZWNBSP character). */
5235 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005236 if (size >= 2) {
5237 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005238#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005239 if (bom == 0xFEFF) {
5240 q += 2;
5241 bo = -1;
5242 }
5243 else if (bom == 0xFFFE) {
5244 q += 2;
5245 bo = 1;
5246 }
Tim Petersced69f82003-09-16 20:30:58 +00005247#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005248 if (bom == 0xFEFF) {
5249 q += 2;
5250 bo = 1;
5251 }
5252 else if (bom == 0xFFFE) {
5253 q += 2;
5254 bo = -1;
5255 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005256#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005257 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005258 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005259
Tim Peters772747b2001-08-09 22:21:55 +00005260 if (bo == -1) {
5261 /* force LE */
5262 ihi = 1;
5263 ilo = 0;
5264 }
5265 else if (bo == 1) {
5266 /* force BE */
5267 ihi = 0;
5268 ilo = 1;
5269 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005270#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5271 native_ordering = ilo < ihi;
5272#else
5273 native_ordering = ilo > ihi;
5274#endif
Tim Peters772747b2001-08-09 22:21:55 +00005275
Antoine Pitrouab868312009-01-10 15:40:25 +00005276 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005277 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005278 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005279 /* First check for possible aligned read of a C 'long'. Unaligned
5280 reads are more expensive, better to defer to another iteration. */
5281 if (!((size_t) q & LONG_PTR_MASK)) {
5282 /* Fast path for runs of non-surrogate chars. */
5283 register const unsigned char *_q = q;
5284 Py_UNICODE *_p = p;
5285 if (native_ordering) {
5286 /* Native ordering is simple: as long as the input cannot
5287 possibly contain a surrogate char, do an unrolled copy
5288 of several 16-bit code points to the target object.
5289 The non-surrogate check is done on several input bytes
5290 at a time (as many as a C 'long' can contain). */
5291 while (_q < aligned_end) {
5292 unsigned long data = * (unsigned long *) _q;
5293 if (data & FAST_CHAR_MASK)
5294 break;
5295 _p[0] = ((unsigned short *) _q)[0];
5296 _p[1] = ((unsigned short *) _q)[1];
5297#if (SIZEOF_LONG == 8)
5298 _p[2] = ((unsigned short *) _q)[2];
5299 _p[3] = ((unsigned short *) _q)[3];
5300#endif
5301 _q += SIZEOF_LONG;
5302 _p += SIZEOF_LONG / 2;
5303 }
5304 }
5305 else {
5306 /* Byteswapped ordering is similar, but we must decompose
5307 the copy bytewise, and take care of zero'ing out the
5308 upper bytes if the target object is in 32-bit units
5309 (that is, in UCS-4 builds). */
5310 while (_q < aligned_end) {
5311 unsigned long data = * (unsigned long *) _q;
5312 if (data & SWAPPED_FAST_CHAR_MASK)
5313 break;
5314 /* Zero upper bytes in UCS-4 builds */
5315#if (Py_UNICODE_SIZE > 2)
5316 _p[0] = 0;
5317 _p[1] = 0;
5318#if (SIZEOF_LONG == 8)
5319 _p[2] = 0;
5320 _p[3] = 0;
5321#endif
5322#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005323 /* Issue #4916; UCS-4 builds on big endian machines must
5324 fill the two last bytes of each 4-byte unit. */
5325#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
5326# define OFF 2
5327#else
5328# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00005329#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005330 ((unsigned char *) _p)[OFF + 1] = _q[0];
5331 ((unsigned char *) _p)[OFF + 0] = _q[1];
5332 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
5333 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
5334#if (SIZEOF_LONG == 8)
5335 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
5336 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
5337 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
5338 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
5339#endif
5340#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00005341 _q += SIZEOF_LONG;
5342 _p += SIZEOF_LONG / 2;
5343 }
5344 }
5345 p = _p;
5346 q = _q;
5347 if (q >= e)
5348 break;
5349 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005350 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005351
Benjamin Peterson14339b62009-01-31 16:36:08 +00005352 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005353
5354 if (ch < 0xD800 || ch > 0xDFFF) {
5355 *p++ = ch;
5356 continue;
5357 }
5358
5359 /* UTF-16 code pair: */
5360 if (q > e) {
5361 errmsg = "unexpected end of data";
5362 startinpos = (((const char *)q) - 2) - starts;
5363 endinpos = ((const char *)e) + 1 - starts;
5364 goto utf16Error;
5365 }
5366 if (0xD800 <= ch && ch <= 0xDBFF) {
5367 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
5368 q += 2;
5369 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00005370#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005371 *p++ = ch;
5372 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005373#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005374 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005375#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005376 continue;
5377 }
5378 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005379 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005380 startinpos = (((const char *)q)-4)-starts;
5381 endinpos = startinpos+2;
5382 goto utf16Error;
5383 }
5384
Benjamin Peterson14339b62009-01-31 16:36:08 +00005385 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005386 errmsg = "illegal encoding";
5387 startinpos = (((const char *)q)-2)-starts;
5388 endinpos = startinpos+2;
5389 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005390
Benjamin Peterson29060642009-01-31 22:14:21 +00005391 utf16Error:
5392 outpos = p - PyUnicode_AS_UNICODE(unicode);
5393 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005394 errors,
5395 &errorHandler,
5396 "utf16", errmsg,
5397 &starts,
5398 (const char **)&e,
5399 &startinpos,
5400 &endinpos,
5401 &exc,
5402 (const char **)&q,
5403 &unicode,
5404 &outpos,
5405 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00005406 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005407 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005408 /* remaining byte at the end? (size should be even) */
5409 if (e == q) {
5410 if (!consumed) {
5411 errmsg = "truncated data";
5412 startinpos = ((const char *)q) - starts;
5413 endinpos = ((const char *)e) + 1 - starts;
5414 outpos = p - PyUnicode_AS_UNICODE(unicode);
5415 if (unicode_decode_call_errorhandler(
5416 errors,
5417 &errorHandler,
5418 "utf16", errmsg,
5419 &starts,
5420 (const char **)&e,
5421 &startinpos,
5422 &endinpos,
5423 &exc,
5424 (const char **)&q,
5425 &unicode,
5426 &outpos,
5427 &p))
5428 goto onError;
5429 /* The remaining input chars are ignored if the callback
5430 chooses to skip the input */
5431 }
5432 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005433
5434 if (byteorder)
5435 *byteorder = bo;
5436
Walter Dörwald69652032004-09-07 20:24:22 +00005437 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005438 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005439
Guido van Rossumd57fd912000-03-10 22:53:23 +00005440 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02005441 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005442 goto onError;
5443
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005444 Py_XDECREF(errorHandler);
5445 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005446#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005447 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005448 Py_DECREF(unicode);
5449 return NULL;
5450 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005451#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005452 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005453 return (PyObject *)unicode;
5454
Benjamin Peterson29060642009-01-31 22:14:21 +00005455 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005456 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005457 Py_XDECREF(errorHandler);
5458 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005459 return NULL;
5460}
5461
Antoine Pitrouab868312009-01-10 15:40:25 +00005462#undef FAST_CHAR_MASK
5463#undef SWAPPED_FAST_CHAR_MASK
5464
Tim Peters772747b2001-08-09 22:21:55 +00005465PyObject *
5466PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005467 Py_ssize_t size,
5468 const char *errors,
5469 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005470{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005471 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005472 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005473 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005474#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005475 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005476#else
5477 const int pairs = 0;
5478#endif
Tim Peters772747b2001-08-09 22:21:55 +00005479 /* Offsets from p for storing byte pairs in the right order. */
5480#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5481 int ihi = 1, ilo = 0;
5482#else
5483 int ihi = 0, ilo = 1;
5484#endif
5485
Benjamin Peterson29060642009-01-31 22:14:21 +00005486#define STORECHAR(CH) \
5487 do { \
5488 p[ihi] = ((CH) >> 8) & 0xff; \
5489 p[ilo] = (CH) & 0xff; \
5490 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005491 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005492
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005493#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005494 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005495 if (s[i] >= 0x10000)
5496 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005497#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005498 /* 2 * (size + pairs + (byteorder == 0)) */
5499 if (size > PY_SSIZE_T_MAX ||
5500 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005501 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005502 nsize = size + pairs + (byteorder == 0);
5503 bytesize = nsize * 2;
5504 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005505 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005506 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005507 if (v == NULL)
5508 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005509
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005510 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005511 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005512 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005513 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005514 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005515
5516 if (byteorder == -1) {
5517 /* force LE */
5518 ihi = 1;
5519 ilo = 0;
5520 }
5521 else if (byteorder == 1) {
5522 /* force BE */
5523 ihi = 0;
5524 ilo = 1;
5525 }
5526
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005527 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005528 Py_UNICODE ch = *s++;
5529 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005530#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005531 if (ch >= 0x10000) {
5532 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5533 ch = 0xD800 | ((ch-0x10000) >> 10);
5534 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005535#endif
Tim Peters772747b2001-08-09 22:21:55 +00005536 STORECHAR(ch);
5537 if (ch2)
5538 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005539 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005540
5541 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005542 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005543#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005544}
5545
Alexander Belopolsky40018472011-02-26 01:02:56 +00005546PyObject *
5547PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005548{
5549 if (!PyUnicode_Check(unicode)) {
5550 PyErr_BadArgument();
5551 return NULL;
5552 }
5553 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005554 PyUnicode_GET_SIZE(unicode),
5555 NULL,
5556 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005557}
5558
5559/* --- Unicode Escape Codec ----------------------------------------------- */
5560
/* Helper function for PyUnicode_DecodeUnicodeEscape: determines whether
   the escaped string decodes to a pure ASCII string and, if so, how many
   characters the decoded string has.

   Returns the decoded length, or -1 if the length cannot be predicted
   this way, namely when
     - size is negative,
     - the input contains a byte > 127,
     - a backslash is the last byte of the input,
     - an octal, \x, \u, \U or \N escape is present (these do not
       guarantee an ASCII result).
   A backslash followed by a newline contributes no character; an
   unrecognised escape contributes two (the backslash and the character
   after it).
 */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Validate size before using it in pointer arithmetic: forming
       s + size for a negative size is undefined behaviour. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
                break;
            }
        }
    }
    return length;
}
5615
/* Store `value` at position `index` of `buf`, in the manner of
   PyUnicode_WRITE but knowing only two layouts: Py_UNICODE units when
   `kind` is PyUnicode_WCHAR_KIND, one unsigned char per character for any
   other kind. */
#define WRITE_ASCII_OR_WSTR(kind, buf, index, value)                    \
    do {                                                                \
        if ((kind) == PyUnicode_WCHAR_KIND)                             \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);       \
        else                                                            \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
    } while (0)

/* Store `value` as a Py_UNICODE unit at position `index` of `buf`.
   Asserts on a variable named `kind` that must be in scope at the point
   of use. */
#define WRITE_WSTR(buf, index, value)                       \
    (assert(kind == PyUnicode_WCHAR_KIND),                  \
     ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value))
5629
5630
/* Cached pointer to the character-name lookup C API.  It starts out NULL;
   PyUnicode_DecodeUnicodeEscape fills it in on demand via
   PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1) when it meets a \N{...}
   escape, and then calls its getcode() entry to turn a name into a code
   point. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005632
Alexander Belopolsky40018472011-02-26 01:02:56 +00005633PyObject *
5634PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005635 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005636 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005637{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005638 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005639 Py_ssize_t startinpos;
5640 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005641 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005642 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005643 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005644 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005645 char* message;
5646 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005647 PyObject *errorHandler = NULL;
5648 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005649 Py_ssize_t ascii_length;
5650 Py_ssize_t i;
5651 int kind;
5652 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005653
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005654 ascii_length = length_of_escaped_ascii_string(s, size);
5655
5656 /* After length_of_escaped_ascii_string() there are two alternatives,
5657 either the string is pure ASCII with named escapes like \n, etc.
5658 and we determined it's exact size (common case)
5659 or it contains \x, \u, ... escape sequences. then we create a
5660 legacy wchar string and resize it at the end of this function. */
5661 if (ascii_length >= 0) {
5662 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
5663 if (!v)
5664 goto onError;
5665 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5666 kind = PyUnicode_1BYTE_KIND;
5667 data = PyUnicode_DATA(v);
5668 }
5669 else {
5670 /* Escaped strings will always be longer than the resulting
5671 Unicode string, so we start with size here and then reduce the
5672 length after conversion to the true value.
5673 (but if the error callback returns a long replacement string
5674 we'll have to allocate more space) */
5675 v = _PyUnicode_New(size);
5676 if (!v)
5677 goto onError;
5678 kind = PyUnicode_WCHAR_KIND;
5679 data = PyUnicode_AS_UNICODE(v);
5680 }
5681
Guido van Rossumd57fd912000-03-10 22:53:23 +00005682 if (size == 0)
5683 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005684 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005685 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005686
Guido van Rossumd57fd912000-03-10 22:53:23 +00005687 while (s < end) {
5688 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005689 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005690 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005691
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005692 if (kind == PyUnicode_WCHAR_KIND) {
5693 assert(i < _PyUnicode_WSTR_LENGTH(v));
5694 }
5695 else {
5696 /* The only case in which i == ascii_length is a backslash
5697 followed by a newline. */
5698 assert(i <= ascii_length);
5699 }
5700
Guido van Rossumd57fd912000-03-10 22:53:23 +00005701 /* Non-escape characters are interpreted as Unicode ordinals */
5702 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005703 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005704 continue;
5705 }
5706
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005707 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005708 /* \ - Escapes */
5709 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005710 c = *s++;
5711 if (s > end)
5712 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005713
5714 if (kind == PyUnicode_WCHAR_KIND) {
5715 assert(i < _PyUnicode_WSTR_LENGTH(v));
5716 }
5717 else {
5718 /* The only case in which i == ascii_length is a backslash
5719 followed by a newline. */
5720 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5721 }
5722
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005723 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005724
Benjamin Peterson29060642009-01-31 22:14:21 +00005725 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005726 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005727 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5728 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5729 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5730 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5731 /* FF */
5732 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5733 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5734 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5735 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5736 /* VT */
5737 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5738 /* BEL, not classic C */
5739 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005740
Benjamin Peterson29060642009-01-31 22:14:21 +00005741 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005742 case '0': case '1': case '2': case '3':
5743 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005744 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005745 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005746 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005747 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005748 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005749 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005750 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005751 break;
5752
Benjamin Peterson29060642009-01-31 22:14:21 +00005753 /* hex escapes */
5754 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005755 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005756 digits = 2;
5757 message = "truncated \\xXX escape";
5758 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005759
Benjamin Peterson29060642009-01-31 22:14:21 +00005760 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005761 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005762 digits = 4;
5763 message = "truncated \\uXXXX escape";
5764 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005765
Benjamin Peterson29060642009-01-31 22:14:21 +00005766 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005767 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005768 digits = 8;
5769 message = "truncated \\UXXXXXXXX escape";
5770 hexescape:
5771 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005772 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005773 if (s+digits>end) {
5774 endinpos = size;
5775 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005776 errors, &errorHandler,
5777 "unicodeescape", "end of string in escape sequence",
5778 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005779 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005780 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005781 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005782 goto nextByte;
5783 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005784 for (j = 0; j < digits; ++j) {
5785 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005786 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005787 endinpos = (s+j+1)-starts;
5788 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005789 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005790 errors, &errorHandler,
5791 "unicodeescape", message,
5792 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005793 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005794 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005795 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005796 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005797 }
5798 chr = (chr<<4) & ~0xF;
5799 if (c >= '0' && c <= '9')
5800 chr += c - '0';
5801 else if (c >= 'a' && c <= 'f')
5802 chr += 10 + c - 'a';
5803 else
5804 chr += 10 + c - 'A';
5805 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005806 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005807 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005808 /* _decoding_error will have already written into the
5809 target buffer. */
5810 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005811 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005812 /* when we get here, chr is a 32-bit unicode character */
5813 if (chr <= 0xffff)
5814 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005815 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005816 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005817 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005818 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005819#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005820 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005821#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005822 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005823 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5824 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005825#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005826 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005827 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005828 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005829 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005830 errors, &errorHandler,
5831 "unicodeescape", "illegal Unicode character",
5832 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005833 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005834 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005835 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005836 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005837 break;
5838
Benjamin Peterson29060642009-01-31 22:14:21 +00005839 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005840 case 'N':
5841 message = "malformed \\N character escape";
5842 if (ucnhash_CAPI == NULL) {
5843 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005844 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5845 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005846 if (ucnhash_CAPI == NULL)
5847 goto ucnhashError;
5848 }
5849 if (*s == '{') {
5850 const char *start = s+1;
5851 /* look for the closing brace */
5852 while (*s != '}' && s < end)
5853 s++;
5854 if (s > start && s < end && *s == '}') {
5855 /* found a name. look it up in the unicode database */
5856 message = "unknown Unicode character name";
5857 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005858 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5859 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005860 goto store;
5861 }
5862 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005863 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005864 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005865 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005866 errors, &errorHandler,
5867 "unicodeescape", message,
5868 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005869 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005870 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005871 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005872 break;
5873
5874 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005875 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005876 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005877 message = "\\ at end of string";
5878 s--;
5879 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005880 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005881 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005882 errors, &errorHandler,
5883 "unicodeescape", message,
5884 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005885 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005886 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005887 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005888 }
5889 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005890 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5891 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005892 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005893 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005894 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005895 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005896 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005897 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005898 /* Ensure the length prediction worked in case of ASCII strings */
5899 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5900
Victor Stinnerfe226c02011-10-03 03:52:20 +02005901 if (kind == PyUnicode_WCHAR_KIND)
5902 {
5903 if (PyUnicode_Resize((PyObject**)&v, i) < 0)
5904 goto onError;
Victor Stinnerfe226c02011-10-03 03:52:20 +02005905 }
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005906 Py_XDECREF(errorHandler);
5907 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005908#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005909 if (_PyUnicode_READY_REPLACE(&v)) {
5910 Py_DECREF(v);
5911 return NULL;
5912 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005913#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005914 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005915 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005916
Benjamin Peterson29060642009-01-31 22:14:21 +00005917 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005918 PyErr_SetString(
5919 PyExc_UnicodeError,
5920 "\\N escapes not supported (can't load unicodedata module)"
5921 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005922 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005923 Py_XDECREF(errorHandler);
5924 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005925 return NULL;
5926
Benjamin Peterson29060642009-01-31 22:14:21 +00005927 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005928 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005929 Py_XDECREF(errorHandler);
5930 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005931 return NULL;
5932}
5933
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005934#undef WRITE_ASCII_OR_WSTR
5935#undef WRITE_WSTR
5936
Guido van Rossumd57fd912000-03-10 22:53:23 +00005937/* Return a Unicode-Escape string version of the Unicode object.
5938
5939 If quotes is true, the string is enclosed in u"" or u'' quotes as
5940 appropriate.
5941
5942*/
5943
Alexander Belopolsky40018472011-02-26 01:02:56 +00005944PyObject *
5945PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005946 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005947{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005948 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005949 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005950
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005951#ifdef Py_UNICODE_WIDE
5952 const Py_ssize_t expandsize = 10;
5953#else
5954 const Py_ssize_t expandsize = 6;
5955#endif
5956
Thomas Wouters89f507f2006-12-13 04:49:30 +00005957 /* XXX(nnorwitz): rather than over-allocating, it would be
5958 better to choose a different scheme. Perhaps scan the
5959 first N-chars of the string and allocate based on that size.
5960 */
5961 /* Initial allocation is based on the longest-possible unichr
5962 escape.
5963
5964 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5965 unichr, so in this case it's the longest unichr escape. In
5966 narrow (UTF-16) builds this is five chars per source unichr
5967 since there are two unichrs in the surrogate pair, so in narrow
5968 (UTF-16) builds it's not the longest unichr escape.
5969
5970 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5971 so in the narrow (UTF-16) build case it's the longest unichr
5972 escape.
5973 */
5974
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005975 if (size == 0)
5976 return PyBytes_FromStringAndSize(NULL, 0);
5977
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005978 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005979 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005980
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005981 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005982 2
5983 + expandsize*size
5984 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005985 if (repr == NULL)
5986 return NULL;
5987
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005988 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005989
Guido van Rossumd57fd912000-03-10 22:53:23 +00005990 while (size-- > 0) {
5991 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005992
Walter Dörwald79e913e2007-05-12 11:08:06 +00005993 /* Escape backslashes */
5994 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005995 *p++ = '\\';
5996 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005997 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005998 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005999
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00006000#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006001 /* Map 21-bit characters to '\U00xxxxxx' */
6002 else if (ch >= 0x10000) {
6003 *p++ = '\\';
6004 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006005 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
6006 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
6007 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6008 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6009 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6010 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6011 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6012 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00006013 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006014 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00006015#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006016 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
6017 else if (ch >= 0xD800 && ch < 0xDC00) {
6018 Py_UNICODE ch2;
6019 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00006020
Benjamin Peterson29060642009-01-31 22:14:21 +00006021 ch2 = *s++;
6022 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00006023 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006024 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
6025 *p++ = '\\';
6026 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006027 *p++ = Py_hexdigits[(ucs >> 28) & 0x0000000F];
6028 *p++ = Py_hexdigits[(ucs >> 24) & 0x0000000F];
6029 *p++ = Py_hexdigits[(ucs >> 20) & 0x0000000F];
6030 *p++ = Py_hexdigits[(ucs >> 16) & 0x0000000F];
6031 *p++ = Py_hexdigits[(ucs >> 12) & 0x0000000F];
6032 *p++ = Py_hexdigits[(ucs >> 8) & 0x0000000F];
6033 *p++ = Py_hexdigits[(ucs >> 4) & 0x0000000F];
6034 *p++ = Py_hexdigits[ucs & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00006035 continue;
6036 }
6037 /* Fall through: isolated surrogates are copied as-is */
6038 s--;
6039 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006040 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00006041#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006042
Guido van Rossumd57fd912000-03-10 22:53:23 +00006043 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006044 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006045 *p++ = '\\';
6046 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006047 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6048 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6049 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6050 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006051 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006052
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006053 /* Map special whitespace to '\t', \n', '\r' */
6054 else if (ch == '\t') {
6055 *p++ = '\\';
6056 *p++ = 't';
6057 }
6058 else if (ch == '\n') {
6059 *p++ = '\\';
6060 *p++ = 'n';
6061 }
6062 else if (ch == '\r') {
6063 *p++ = '\\';
6064 *p++ = 'r';
6065 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006066
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006067 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006068 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006069 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006070 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006071 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6072 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006073 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006074
Guido van Rossumd57fd912000-03-10 22:53:23 +00006075 /* Copy everything else as-is */
6076 else
6077 *p++ = (char) ch;
6078 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006079
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006080 assert(p - PyBytes_AS_STRING(repr) > 0);
6081 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6082 return NULL;
6083 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006084}
6085
Alexander Belopolsky40018472011-02-26 01:02:56 +00006086PyObject *
6087PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006088{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006089 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006090 if (!PyUnicode_Check(unicode)) {
6091 PyErr_BadArgument();
6092 return NULL;
6093 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00006094 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
6095 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006096 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006097}
6098
6099/* --- Raw Unicode Escape Codec ------------------------------------------- */
6100
Alexander Belopolsky40018472011-02-26 01:02:56 +00006101PyObject *
6102PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006103 Py_ssize_t size,
6104 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006105{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006106 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006107 Py_ssize_t startinpos;
6108 Py_ssize_t endinpos;
6109 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006110 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006111 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006112 const char *end;
6113 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006114 PyObject *errorHandler = NULL;
6115 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006116
Guido van Rossumd57fd912000-03-10 22:53:23 +00006117 /* Escaped strings will always be longer than the resulting
6118 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006119 length after conversion to the true value. (But decoding error
6120 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006121 v = _PyUnicode_New(size);
6122 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006123 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006124 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006125 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006126 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006127 end = s + size;
6128 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006129 unsigned char c;
6130 Py_UCS4 x;
6131 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006132 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006133
Benjamin Peterson29060642009-01-31 22:14:21 +00006134 /* Non-escape characters are interpreted as Unicode ordinals */
6135 if (*s != '\\') {
6136 *p++ = (unsigned char)*s++;
6137 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006138 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006139 startinpos = s-starts;
6140
6141 /* \u-escapes are only interpreted iff the number of leading
6142 backslashes if odd */
6143 bs = s;
6144 for (;s < end;) {
6145 if (*s != '\\')
6146 break;
6147 *p++ = (unsigned char)*s++;
6148 }
6149 if (((s - bs) & 1) == 0 ||
6150 s >= end ||
6151 (*s != 'u' && *s != 'U')) {
6152 continue;
6153 }
6154 p--;
6155 count = *s=='u' ? 4 : 8;
6156 s++;
6157
6158 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
6159 outpos = p-PyUnicode_AS_UNICODE(v);
6160 for (x = 0, i = 0; i < count; ++i, ++s) {
6161 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006162 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006163 endinpos = s-starts;
6164 if (unicode_decode_call_errorhandler(
6165 errors, &errorHandler,
6166 "rawunicodeescape", "truncated \\uXXXX",
6167 &starts, &end, &startinpos, &endinpos, &exc, &s,
6168 &v, &outpos, &p))
6169 goto onError;
6170 goto nextByte;
6171 }
6172 x = (x<<4) & ~0xF;
6173 if (c >= '0' && c <= '9')
6174 x += c - '0';
6175 else if (c >= 'a' && c <= 'f')
6176 x += 10 + c - 'a';
6177 else
6178 x += 10 + c - 'A';
6179 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00006180 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00006181 /* UCS-2 character */
6182 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006183 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006184 /* UCS-4 character. Either store directly, or as
6185 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00006186#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006187 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006188#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006189 x -= 0x10000L;
6190 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
6191 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00006192#endif
6193 } else {
6194 endinpos = s-starts;
6195 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006196 if (unicode_decode_call_errorhandler(
6197 errors, &errorHandler,
6198 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006199 &starts, &end, &startinpos, &endinpos, &exc, &s,
6200 &v, &outpos, &p))
6201 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006202 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006203 nextByte:
6204 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006205 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02006206 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006207 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006208 Py_XDECREF(errorHandler);
6209 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006210#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006211 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006212 Py_DECREF(v);
6213 return NULL;
6214 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006215#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006216 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006217 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006218
Benjamin Peterson29060642009-01-31 22:14:21 +00006219 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006220 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006221 Py_XDECREF(errorHandler);
6222 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006223 return NULL;
6224}
6225
Alexander Belopolsky40018472011-02-26 01:02:56 +00006226PyObject *
6227PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006228 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006229{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006230 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006231 char *p;
6232 char *q;
6233
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006234#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006235 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006236#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006237 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006238#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00006239
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006240 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006241 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006242
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006243 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006244 if (repr == NULL)
6245 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00006246 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006247 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006248
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006249 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006250 while (size-- > 0) {
6251 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006252#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006253 /* Map 32-bit characters to '\Uxxxxxxxx' */
6254 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006255 *p++ = '\\';
6256 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006257 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6258 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6259 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6260 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6261 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6262 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6263 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6264 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006265 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006266 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00006267#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006268 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
6269 if (ch >= 0xD800 && ch < 0xDC00) {
6270 Py_UNICODE ch2;
6271 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006272
Benjamin Peterson29060642009-01-31 22:14:21 +00006273 ch2 = *s++;
6274 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00006275 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006276 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
6277 *p++ = '\\';
6278 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006279 *p++ = Py_hexdigits[(ucs >> 28) & 0xf];
6280 *p++ = Py_hexdigits[(ucs >> 24) & 0xf];
6281 *p++ = Py_hexdigits[(ucs >> 20) & 0xf];
6282 *p++ = Py_hexdigits[(ucs >> 16) & 0xf];
6283 *p++ = Py_hexdigits[(ucs >> 12) & 0xf];
6284 *p++ = Py_hexdigits[(ucs >> 8) & 0xf];
6285 *p++ = Py_hexdigits[(ucs >> 4) & 0xf];
6286 *p++ = Py_hexdigits[ucs & 0xf];
Benjamin Peterson29060642009-01-31 22:14:21 +00006287 continue;
6288 }
6289 /* Fall through: isolated surrogates are copied as-is */
6290 s--;
6291 size++;
6292 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006293#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006294 /* Map 16-bit characters to '\uxxxx' */
6295 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006296 *p++ = '\\';
6297 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006298 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6299 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6300 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6301 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006302 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006303 /* Copy everything else as-is */
6304 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006305 *p++ = (char) ch;
6306 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006307 size = p - q;
6308
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006309 assert(size > 0);
6310 if (_PyBytes_Resize(&repr, size) < 0)
6311 return NULL;
6312 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006313}
6314
Alexander Belopolsky40018472011-02-26 01:02:56 +00006315PyObject *
6316PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006317{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006318 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006319 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00006320 PyErr_BadArgument();
6321 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006322 }
Walter Dörwald711005d2007-05-12 12:03:26 +00006323 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
6324 PyUnicode_GET_SIZE(unicode));
6325
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006326 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006327}
6328
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006329/* --- Unicode Internal Codec ------------------------------------------- */
6330
Alexander Belopolsky40018472011-02-26 01:02:56 +00006331PyObject *
6332_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006333 Py_ssize_t size,
6334 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006335{
6336 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006337 Py_ssize_t startinpos;
6338 Py_ssize_t endinpos;
6339 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006340 PyUnicodeObject *v;
6341 Py_UNICODE *p;
6342 const char *end;
6343 const char *reason;
6344 PyObject *errorHandler = NULL;
6345 PyObject *exc = NULL;
6346
Neal Norwitzd43069c2006-01-08 01:12:10 +00006347#ifdef Py_UNICODE_WIDE
6348 Py_UNICODE unimax = PyUnicode_GetMax();
6349#endif
6350
Thomas Wouters89f507f2006-12-13 04:49:30 +00006351 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006352 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
6353 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006354 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006355 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
6356 as string was created with the old API. */
6357 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006358 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006359 p = PyUnicode_AS_UNICODE(v);
6360 end = s + size;
6361
6362 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006363 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006364 /* We have to sanity check the raw data, otherwise doom looms for
6365 some malformed UCS-4 data. */
6366 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006367#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006368 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006369#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006370 end-s < Py_UNICODE_SIZE
6371 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006372 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006373 startinpos = s - starts;
6374 if (end-s < Py_UNICODE_SIZE) {
6375 endinpos = end-starts;
6376 reason = "truncated input";
6377 }
6378 else {
6379 endinpos = s - starts + Py_UNICODE_SIZE;
6380 reason = "illegal code point (> 0x10FFFF)";
6381 }
6382 outpos = p - PyUnicode_AS_UNICODE(v);
6383 if (unicode_decode_call_errorhandler(
6384 errors, &errorHandler,
6385 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006386 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00006387 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006388 goto onError;
6389 }
6390 }
6391 else {
6392 p++;
6393 s += Py_UNICODE_SIZE;
6394 }
6395 }
6396
Victor Stinnerfe226c02011-10-03 03:52:20 +02006397 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006398 goto onError;
6399 Py_XDECREF(errorHandler);
6400 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006401#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006402 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006403 Py_DECREF(v);
6404 return NULL;
6405 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006406#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006407 assert(_PyUnicode_CheckConsistency(v, 1));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006408 return (PyObject *)v;
6409
Benjamin Peterson29060642009-01-31 22:14:21 +00006410 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006411 Py_XDECREF(v);
6412 Py_XDECREF(errorHandler);
6413 Py_XDECREF(exc);
6414 return NULL;
6415}
6416
Guido van Rossumd57fd912000-03-10 22:53:23 +00006417/* --- Latin-1 Codec ------------------------------------------------------ */
6418
Alexander Belopolsky40018472011-02-26 01:02:56 +00006419PyObject *
6420PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006421 Py_ssize_t size,
6422 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006423{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006424 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006425 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006426}
6427
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006428/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006429static void
6430make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006431 const char *encoding,
6432 const Py_UNICODE *unicode, Py_ssize_t size,
6433 Py_ssize_t startpos, Py_ssize_t endpos,
6434 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006435{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006436 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006437 *exceptionObject = PyUnicodeEncodeError_Create(
6438 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006439 }
6440 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006441 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6442 goto onError;
6443 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6444 goto onError;
6445 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6446 goto onError;
6447 return;
6448 onError:
6449 Py_DECREF(*exceptionObject);
6450 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006451 }
6452}
6453
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006454/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006455static void
6456raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006457 const char *encoding,
6458 const Py_UNICODE *unicode, Py_ssize_t size,
6459 Py_ssize_t startpos, Py_ssize_t endpos,
6460 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006461{
6462 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006463 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006464 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006465 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006466}
6467
6468/* error handling callback helper:
6469 build arguments, call the callback and check the arguments,
6470 put the result into newpos and return the replacement string, which
6471 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006472static PyObject *
6473unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006474 PyObject **errorHandler,
6475 const char *encoding, const char *reason,
6476 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
6477 Py_ssize_t startpos, Py_ssize_t endpos,
6478 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006479{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006480 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006481
6482 PyObject *restuple;
6483 PyObject *resunicode;
6484
6485 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006486 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006487 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006488 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006489 }
6490
6491 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006492 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006493 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006494 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006495
6496 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006497 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006498 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006499 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006500 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006501 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006502 Py_DECREF(restuple);
6503 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006504 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006505 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006506 &resunicode, newpos)) {
6507 Py_DECREF(restuple);
6508 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006509 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006510 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6511 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6512 Py_DECREF(restuple);
6513 return NULL;
6514 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006515 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006516 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006517 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006518 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6519 Py_DECREF(restuple);
6520 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006521 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006522 Py_INCREF(resunicode);
6523 Py_DECREF(restuple);
6524 return resunicode;
6525}
6526
Alexander Belopolsky40018472011-02-26 01:02:56 +00006527static PyObject *
6528unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006529 Py_ssize_t size,
6530 const char *errors,
6531 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006532{
6533 /* output object */
6534 PyObject *res;
6535 /* pointers to the beginning and end+1 of input */
6536 const Py_UNICODE *startp = p;
6537 const Py_UNICODE *endp = p + size;
6538 /* pointer to the beginning of the unencodable characters */
6539 /* const Py_UNICODE *badp = NULL; */
6540 /* pointer into the output */
6541 char *str;
6542 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006543 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006544 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6545 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006546 PyObject *errorHandler = NULL;
6547 PyObject *exc = NULL;
6548 /* the following variable is used for caching string comparisons
6549 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6550 int known_errorHandler = -1;
6551
6552 /* allocate enough for a simple encoding without
6553 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006554 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006555 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006556 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006557 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006558 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006559 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006560 ressize = size;
6561
6562 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006563 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006564
Benjamin Peterson29060642009-01-31 22:14:21 +00006565 /* can we encode this? */
6566 if (c<limit) {
6567 /* no overflow check, because we know that the space is enough */
6568 *str++ = (char)c;
6569 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006570 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006571 else {
6572 Py_ssize_t unicodepos = p-startp;
6573 Py_ssize_t requiredsize;
6574 PyObject *repunicode;
6575 Py_ssize_t repsize;
6576 Py_ssize_t newpos;
6577 Py_ssize_t respos;
6578 Py_UNICODE *uni2;
6579 /* startpos for collecting unencodable chars */
6580 const Py_UNICODE *collstart = p;
6581 const Py_UNICODE *collend = p;
6582 /* find all unecodable characters */
6583 while ((collend < endp) && ((*collend)>=limit))
6584 ++collend;
6585 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6586 if (known_errorHandler==-1) {
6587 if ((errors==NULL) || (!strcmp(errors, "strict")))
6588 known_errorHandler = 1;
6589 else if (!strcmp(errors, "replace"))
6590 known_errorHandler = 2;
6591 else if (!strcmp(errors, "ignore"))
6592 known_errorHandler = 3;
6593 else if (!strcmp(errors, "xmlcharrefreplace"))
6594 known_errorHandler = 4;
6595 else
6596 known_errorHandler = 0;
6597 }
6598 switch (known_errorHandler) {
6599 case 1: /* strict */
6600 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
6601 goto onError;
6602 case 2: /* replace */
6603 while (collstart++<collend)
6604 *str++ = '?'; /* fall through */
6605 case 3: /* ignore */
6606 p = collend;
6607 break;
6608 case 4: /* xmlcharrefreplace */
6609 respos = str - PyBytes_AS_STRING(res);
6610 /* determine replacement size (temporarily (mis)uses p) */
6611 for (p = collstart, repsize = 0; p < collend; ++p) {
6612 if (*p<10)
6613 repsize += 2+1+1;
6614 else if (*p<100)
6615 repsize += 2+2+1;
6616 else if (*p<1000)
6617 repsize += 2+3+1;
6618 else if (*p<10000)
6619 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006620#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006621 else
6622 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006623#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006624 else if (*p<100000)
6625 repsize += 2+5+1;
6626 else if (*p<1000000)
6627 repsize += 2+6+1;
6628 else
6629 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00006630#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006631 }
6632 requiredsize = respos+repsize+(endp-collend);
6633 if (requiredsize > ressize) {
6634 if (requiredsize<2*ressize)
6635 requiredsize = 2*ressize;
6636 if (_PyBytes_Resize(&res, requiredsize))
6637 goto onError;
6638 str = PyBytes_AS_STRING(res) + respos;
6639 ressize = requiredsize;
6640 }
6641 /* generate replacement (temporarily (mis)uses p) */
6642 for (p = collstart; p < collend; ++p) {
6643 str += sprintf(str, "&#%d;", (int)*p);
6644 }
6645 p = collend;
6646 break;
6647 default:
6648 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6649 encoding, reason, startp, size, &exc,
6650 collstart-startp, collend-startp, &newpos);
6651 if (repunicode == NULL)
6652 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006653 if (PyBytes_Check(repunicode)) {
6654 /* Directly copy bytes result to output. */
6655 repsize = PyBytes_Size(repunicode);
6656 if (repsize > 1) {
6657 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006658 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006659 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6660 Py_DECREF(repunicode);
6661 goto onError;
6662 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006663 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006664 ressize += repsize-1;
6665 }
6666 memcpy(str, PyBytes_AsString(repunicode), repsize);
6667 str += repsize;
6668 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006669 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006670 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006671 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006672 /* need more space? (at least enough for what we
6673 have+the replacement+the rest of the string, so
6674 we won't have to check space for encodable characters) */
6675 respos = str - PyBytes_AS_STRING(res);
6676 repsize = PyUnicode_GET_SIZE(repunicode);
6677 requiredsize = respos+repsize+(endp-collend);
6678 if (requiredsize > ressize) {
6679 if (requiredsize<2*ressize)
6680 requiredsize = 2*ressize;
6681 if (_PyBytes_Resize(&res, requiredsize)) {
6682 Py_DECREF(repunicode);
6683 goto onError;
6684 }
6685 str = PyBytes_AS_STRING(res) + respos;
6686 ressize = requiredsize;
6687 }
6688 /* check if there is anything unencodable in the replacement
6689 and copy it to the output */
6690 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
6691 c = *uni2;
6692 if (c >= limit) {
6693 raise_encode_exception(&exc, encoding, startp, size,
6694 unicodepos, unicodepos+1, reason);
6695 Py_DECREF(repunicode);
6696 goto onError;
6697 }
6698 *str = (char)c;
6699 }
6700 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006701 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006702 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006703 }
6704 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006705 /* Resize if we allocated to much */
6706 size = str - PyBytes_AS_STRING(res);
6707 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006708 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006709 if (_PyBytes_Resize(&res, size) < 0)
6710 goto onError;
6711 }
6712
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006713 Py_XDECREF(errorHandler);
6714 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006715 return res;
6716
6717 onError:
6718 Py_XDECREF(res);
6719 Py_XDECREF(errorHandler);
6720 Py_XDECREF(exc);
6721 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006722}
6723
Alexander Belopolsky40018472011-02-26 01:02:56 +00006724PyObject *
6725PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006726 Py_ssize_t size,
6727 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006728{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006729 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006730}
6731
Alexander Belopolsky40018472011-02-26 01:02:56 +00006732PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006733_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006734{
6735 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006736 PyErr_BadArgument();
6737 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006738 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006739 if (PyUnicode_READY(unicode) == -1)
6740 return NULL;
6741 /* Fast path: if it is a one-byte string, construct
6742 bytes object directly. */
6743 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6744 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6745 PyUnicode_GET_LENGTH(unicode));
6746 /* Non-Latin-1 characters present. Defer to above function to
6747 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006748 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006749 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006750 errors);
6751}
6752
6753PyObject*
6754PyUnicode_AsLatin1String(PyObject *unicode)
6755{
6756 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006757}
6758
6759/* --- 7-bit ASCII Codec -------------------------------------------------- */
6760
Alexander Belopolsky40018472011-02-26 01:02:56 +00006761PyObject *
6762PyUnicode_DecodeASCII(const char *s,
6763 Py_ssize_t size,
6764 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006765{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006766 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006767 PyUnicodeObject *v;
Victor Stinner702c7342011-10-05 13:50:52 +02006768 Py_UNICODE *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006769 Py_ssize_t startinpos;
6770 Py_ssize_t endinpos;
6771 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006772 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006773 int has_error;
6774 const unsigned char *p = (const unsigned char *)s;
6775 const unsigned char *end = p + size;
6776 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006777 PyObject *errorHandler = NULL;
6778 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006779
Guido van Rossumd57fd912000-03-10 22:53:23 +00006780 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006781 if (size == 1 && (unsigned char)s[0] < 128)
6782 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006783
Victor Stinner702c7342011-10-05 13:50:52 +02006784 has_error = 0;
6785 while (p < end && !has_error) {
6786 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6787 an explanation. */
6788 if (!((size_t) p & LONG_PTR_MASK)) {
6789 /* Help register allocation */
6790 register const unsigned char *_p = p;
6791 while (_p < aligned_end) {
6792 unsigned long value = *(unsigned long *) _p;
6793 if (value & ASCII_CHAR_MASK) {
6794 has_error = 1;
6795 break;
6796 }
6797 _p += SIZEOF_LONG;
6798 }
6799 if (_p == end)
6800 break;
6801 if (has_error)
6802 break;
6803 p = _p;
6804 }
6805 if (*p & 0x80) {
6806 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006807 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006808 }
6809 else {
6810 ++p;
6811 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006812 }
Victor Stinner702c7342011-10-05 13:50:52 +02006813 if (!has_error)
6814 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006815
Guido van Rossumd57fd912000-03-10 22:53:23 +00006816 v = _PyUnicode_New(size);
6817 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006818 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006819 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006820 return (PyObject *)v;
Victor Stinner702c7342011-10-05 13:50:52 +02006821 u = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006822 e = s + size;
6823 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006824 register unsigned char c = (unsigned char)*s;
6825 if (c < 128) {
Victor Stinner702c7342011-10-05 13:50:52 +02006826 *u++ = c;
Benjamin Peterson29060642009-01-31 22:14:21 +00006827 ++s;
6828 }
6829 else {
6830 startinpos = s-starts;
6831 endinpos = startinpos + 1;
Victor Stinner702c7342011-10-05 13:50:52 +02006832 outpos = u - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006833 if (unicode_decode_call_errorhandler(
6834 errors, &errorHandler,
6835 "ascii", "ordinal not in range(128)",
6836 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinner702c7342011-10-05 13:50:52 +02006837 &v, &outpos, &u))
Benjamin Peterson29060642009-01-31 22:14:21 +00006838 goto onError;
6839 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006840 }
Victor Stinner702c7342011-10-05 13:50:52 +02006841 if (u - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
6842 if (PyUnicode_Resize((PyObject**)&v, u - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006843 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006844 Py_XDECREF(errorHandler);
6845 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006846#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006847 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006848 Py_DECREF(v);
6849 return NULL;
6850 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006851#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006852 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006853 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006854
Benjamin Peterson29060642009-01-31 22:14:21 +00006855 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006856 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006857 Py_XDECREF(errorHandler);
6858 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006859 return NULL;
6860}
6861
Alexander Belopolsky40018472011-02-26 01:02:56 +00006862PyObject *
6863PyUnicode_EncodeASCII(const Py_UNICODE *p,
6864 Py_ssize_t size,
6865 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006866{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006867 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006868}
6869
Alexander Belopolsky40018472011-02-26 01:02:56 +00006870PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006871_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006872{
6873 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006874 PyErr_BadArgument();
6875 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006876 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006877 if (PyUnicode_READY(unicode) == -1)
6878 return NULL;
6879 /* Fast path: if it is an ASCII-only string, construct bytes object
6880 directly. Else defer to above function to raise the exception. */
6881 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6882 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6883 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006884 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006885 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006886 errors);
6887}
6888
6889PyObject *
6890PyUnicode_AsASCIIString(PyObject *unicode)
6891{
6892 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006893}
6894
Victor Stinner99b95382011-07-04 14:23:54 +02006895#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006896
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006897/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006898
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006899#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006900#define NEED_RETRY
6901#endif
6902
Victor Stinner3a50e702011-10-18 21:21:00 +02006903#ifndef WC_ERR_INVALID_CHARS
6904# define WC_ERR_INVALID_CHARS 0x0080
6905#endif
6906
6907static char*
6908code_page_name(UINT code_page, PyObject **obj)
6909{
6910 *obj = NULL;
6911 if (code_page == CP_ACP)
6912 return "mbcs";
6913 if (code_page == CP_UTF7)
6914 return "CP_UTF7";
6915 if (code_page == CP_UTF8)
6916 return "CP_UTF8";
6917
6918 *obj = PyBytes_FromFormat("cp%u", code_page);
6919 if (*obj == NULL)
6920 return NULL;
6921 return PyBytes_AS_STRING(*obj);
6922}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006923
Alexander Belopolsky40018472011-02-26 01:02:56 +00006924static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006925is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006926{
6927 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006928 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006929
Victor Stinner3a50e702011-10-18 21:21:00 +02006930 if (!IsDBCSLeadByteEx(code_page, *curr))
6931 return 0;
6932
6933 prev = CharPrevExA(code_page, s, curr, 0);
6934 if (prev == curr)
6935 return 1;
6936 /* FIXME: This code is limited to "true" double-byte encodings,
6937 as it assumes an incomplete character consists of a single
6938 byte. */
6939 if (curr - prev == 2)
6940 return 1;
6941 if (!IsDBCSLeadByteEx(code_page, *prev))
6942 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006943 return 0;
6944}
6945
Victor Stinner3a50e702011-10-18 21:21:00 +02006946static DWORD
6947decode_code_page_flags(UINT code_page)
6948{
6949 if (code_page == CP_UTF7) {
6950 /* The CP_UTF7 decoder only supports flags=0 */
6951 return 0;
6952 }
6953 else
6954 return MB_ERR_INVALID_CHARS;
6955}
6956
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006957/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006958 * Decode a byte string from a Windows code page into unicode object in strict
6959 * mode.
6960 *
6961 * Returns consumed size if succeed, returns -2 on decode error, or raise a
6962 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006963 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006964static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006965decode_code_page_strict(UINT code_page,
6966 PyUnicodeObject **v,
6967 const char *in,
6968 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006969{
Victor Stinner3a50e702011-10-18 21:21:00 +02006970 const DWORD flags = decode_code_page_flags(code_page);
6971 Py_UNICODE *out;
6972 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006973
6974 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006975 assert(insize > 0);
6976 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6977 if (outsize <= 0)
6978 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006979
6980 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006981 /* Create unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006982 *v = _PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006983 if (*v == NULL)
6984 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006985 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006986 }
6987 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006988 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006989 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6990 if (PyUnicode_Resize((PyObject**)v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006991 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006992 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006993 }
6994
6995 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006996 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6997 if (outsize <= 0)
6998 goto error;
6999 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007000
Victor Stinner3a50e702011-10-18 21:21:00 +02007001error:
7002 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7003 return -2;
7004 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007005 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007006}
7007
Victor Stinner3a50e702011-10-18 21:21:00 +02007008/*
7009 * Decode a byte string from a code page into unicode object with an error
7010 * handler.
7011 *
7012 * Returns consumed size if succeed, or raise a WindowsError or
7013 * UnicodeDecodeError exception and returns -1 on error.
7014 */
7015static int
7016decode_code_page_errors(UINT code_page,
7017 PyUnicodeObject **v,
7018 const char *in,
7019 int size,
7020 const char *errors)
7021{
7022 const char *startin = in;
7023 const char *endin = in + size;
7024 const DWORD flags = decode_code_page_flags(code_page);
7025 /* Ideally, we should get reason from FormatMessage. This is the Windows
7026 2000 English version of the message. */
7027 const char *reason = "No mapping for the Unicode character exists "
7028 "in the target code page.";
7029 /* each step cannot decode more than 1 character, but a character can be
7030 represented as a surrogate pair */
7031 wchar_t buffer[2], *startout, *out;
7032 int insize, outsize;
7033 PyObject *errorHandler = NULL;
7034 PyObject *exc = NULL;
7035 PyObject *encoding_obj = NULL;
7036 char *encoding;
7037 DWORD err;
7038 int ret = -1;
7039
7040 assert(size > 0);
7041
7042 encoding = code_page_name(code_page, &encoding_obj);
7043 if (encoding == NULL)
7044 return -1;
7045
7046 if (errors == NULL || strcmp(errors, "strict") == 0) {
7047 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7048 UnicodeDecodeError. */
7049 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7050 if (exc != NULL) {
7051 PyCodec_StrictErrors(exc);
7052 Py_CLEAR(exc);
7053 }
7054 goto error;
7055 }
7056
7057 if (*v == NULL) {
7058 /* Create unicode object */
7059 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7060 PyErr_NoMemory();
7061 goto error;
7062 }
7063 *v = _PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
7064 if (*v == NULL)
7065 goto error;
7066 startout = PyUnicode_AS_UNICODE(*v);
7067 }
7068 else {
7069 /* Extend unicode object */
7070 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7071 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7072 PyErr_NoMemory();
7073 goto error;
7074 }
7075 if (PyUnicode_Resize((PyObject**)v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
7076 goto error;
7077 startout = PyUnicode_AS_UNICODE(*v) + n;
7078 }
7079
7080 /* Decode the byte string character per character */
7081 out = startout;
7082 while (in < endin)
7083 {
7084 /* Decode a character */
7085 insize = 1;
7086 do
7087 {
7088 outsize = MultiByteToWideChar(code_page, flags,
7089 in, insize,
7090 buffer, Py_ARRAY_LENGTH(buffer));
7091 if (outsize > 0)
7092 break;
7093 err = GetLastError();
7094 if (err != ERROR_NO_UNICODE_TRANSLATION
7095 && err != ERROR_INSUFFICIENT_BUFFER)
7096 {
7097 PyErr_SetFromWindowsErr(0);
7098 goto error;
7099 }
7100 insize++;
7101 }
7102 /* 4=maximum length of a UTF-8 sequence */
7103 while (insize <= 4 && (in + insize) <= endin);
7104
7105 if (outsize <= 0) {
7106 Py_ssize_t startinpos, endinpos, outpos;
7107
7108 startinpos = in - startin;
7109 endinpos = startinpos + 1;
7110 outpos = out - PyUnicode_AS_UNICODE(*v);
7111 if (unicode_decode_call_errorhandler(
7112 errors, &errorHandler,
7113 encoding, reason,
7114 &startin, &endin, &startinpos, &endinpos, &exc, &in,
7115 v, &outpos, &out))
7116 {
7117 goto error;
7118 }
7119 }
7120 else {
7121 in += insize;
7122 memcpy(out, buffer, outsize * sizeof(wchar_t));
7123 out += outsize;
7124 }
7125 }
7126
7127 /* write a NUL character at the end */
7128 *out = 0;
7129
7130 /* Extend unicode object */
7131 outsize = out - startout;
7132 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
7133 if (PyUnicode_Resize((PyObject**)v, outsize) < 0)
7134 goto error;
7135 ret = 0;
7136
7137error:
7138 Py_XDECREF(encoding_obj);
7139 Py_XDECREF(errorHandler);
7140 Py_XDECREF(exc);
7141 return ret;
7142}
7143
7144/*
7145 * Decode a byte string from a Windows code page into unicode object. If
7146 * 'final' is set, converts trailing lead-byte too.
7147 *
7148 * Returns consumed size if succeed, or raise a WindowsError or
7149 * UnicodeDecodeError exception and returns -1 on error.
7150 */
7151static int
7152decode_code_page(UINT code_page,
7153 PyUnicodeObject **v,
7154 const char *s, int size,
7155 int final, const char *errors)
7156{
7157 int done;
7158
7159 /* Skip trailing lead-byte unless 'final' is set */
7160 if (size == 0) {
7161 if (*v == NULL) {
7162 Py_INCREF(unicode_empty);
7163 *v = (PyUnicodeObject*)unicode_empty;
7164 if (*v == NULL)
7165 return -1;
7166 }
7167 return 0;
7168 }
7169
7170 if (!final && is_dbcs_lead_byte(code_page, s, size - 1))
7171 --size;
7172
7173 done = decode_code_page_strict(code_page, v, s, size);
7174 if (done == -2)
7175 done = decode_code_page_errors(code_page, v, s, size, errors);
7176 return done;
7177}
7178
7179static PyObject *
7180decode_code_page_stateful(int code_page,
7181 const char *s,
7182 Py_ssize_t size,
7183 const char *errors,
7184 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007185{
7186 PyUnicodeObject *v = NULL;
7187 int done;
7188
Victor Stinner3a50e702011-10-18 21:21:00 +02007189 if (code_page < 0) {
7190 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7191 return NULL;
7192 }
7193
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007194 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007195 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007196
7197#ifdef NEED_RETRY
7198 retry:
7199 if (size > INT_MAX)
Victor Stinner3a50e702011-10-18 21:21:00 +02007200 done = decode_code_page(code_page, &v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007201 else
7202#endif
Victor Stinner3a50e702011-10-18 21:21:00 +02007203 done = decode_code_page(code_page, &v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007204
7205 if (done < 0) {
7206 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00007207 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007208 }
7209
7210 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007211 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007212
7213#ifdef NEED_RETRY
7214 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007215 s += done;
7216 size -= done;
7217 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007218 }
7219#endif
Victor Stinner3a50e702011-10-18 21:21:00 +02007220
Victor Stinner17efeed2011-10-04 20:05:46 +02007221#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007222 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007223 Py_DECREF(v);
7224 return NULL;
7225 }
Victor Stinner17efeed2011-10-04 20:05:46 +02007226#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02007227 assert(_PyUnicode_CheckConsistency(v, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007228 return (PyObject *)v;
7229}
7230
Alexander Belopolsky40018472011-02-26 01:02:56 +00007231PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007232PyUnicode_DecodeCodePageStateful(int code_page,
7233 const char *s,
7234 Py_ssize_t size,
7235 const char *errors,
7236 Py_ssize_t *consumed)
7237{
7238 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7239}
7240
7241PyObject *
7242PyUnicode_DecodeMBCSStateful(const char *s,
7243 Py_ssize_t size,
7244 const char *errors,
7245 Py_ssize_t *consumed)
7246{
7247 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7248}
7249
7250PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007251PyUnicode_DecodeMBCS(const char *s,
7252 Py_ssize_t size,
7253 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007254{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007255 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7256}
7257
Victor Stinner3a50e702011-10-18 21:21:00 +02007258static DWORD
7259encode_code_page_flags(UINT code_page, const char *errors)
7260{
7261 if (code_page == CP_UTF8) {
7262 if (winver.dwMajorVersion >= 6)
7263 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7264 and later */
7265 return WC_ERR_INVALID_CHARS;
7266 else
7267 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7268 return 0;
7269 }
7270 else if (code_page == CP_UTF7) {
7271 /* CP_UTF7 only supports flags=0 */
7272 return 0;
7273 }
7274 else {
7275 if (errors != NULL && strcmp(errors, "replace") == 0)
7276 return 0;
7277 else
7278 return WC_NO_BEST_FIT_CHARS;
7279 }
7280}
7281
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007282/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007283 * Encode a Unicode string to a Windows code page into a byte string in strict
7284 * mode.
7285 *
7286 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7287 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007288 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007289static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007290encode_code_page_strict(UINT code_page, PyObject **outbytes,
7291 const Py_UNICODE *p, const int size,
7292 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007293{
Victor Stinner554f3f02010-06-16 23:33:54 +00007294 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007295 BOOL *pusedDefaultChar = &usedDefaultChar;
7296 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007297 PyObject *exc = NULL;
Victor Stinner3a50e702011-10-18 21:21:00 +02007298 const DWORD flags = encode_code_page_flags(code_page, NULL);
7299 char *out;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007300
Victor Stinner3a50e702011-10-18 21:21:00 +02007301 assert(size > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007302
Victor Stinner3a50e702011-10-18 21:21:00 +02007303 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007304 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007305 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007306 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007307
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007308 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007309 outsize = WideCharToMultiByte(code_page, flags,
7310 p, size,
7311 NULL, 0,
7312 NULL, pusedDefaultChar);
7313 if (outsize <= 0)
7314 goto error;
7315 /* If we used a default char, then we failed! */
7316 if (pusedDefaultChar && *pusedDefaultChar)
7317 return -2;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007318
Victor Stinner3a50e702011-10-18 21:21:00 +02007319 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007320 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007321 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7322 if (*outbytes == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007323 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007324 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007325 }
7326 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007327 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007328 const Py_ssize_t n = PyBytes_Size(*outbytes);
7329 if (outsize > PY_SSIZE_T_MAX - n) {
7330 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00007331 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007332 }
7333 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7334 return -1;
7335 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007336 }
7337
7338 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007339 outsize = WideCharToMultiByte(code_page, flags,
7340 p, size,
7341 out, outsize,
7342 NULL, pusedDefaultChar);
7343 if (outsize <= 0)
7344 goto error;
7345 if (pusedDefaultChar && *pusedDefaultChar)
7346 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007347 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007348
Victor Stinner3a50e702011-10-18 21:21:00 +02007349error:
7350 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7351 return -2;
7352 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007353 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007354}
7355
Victor Stinner3a50e702011-10-18 21:21:00 +02007356/*
7357 * Encode a Unicode string to a Windows code page into a byte string using a
7358 * error handler.
7359 *
7360 * Returns consumed characters if succeed, or raise a WindowsError and returns
7361 * -1 on other error.
7362 */
7363static int
7364encode_code_page_errors(UINT code_page, PyObject **outbytes,
7365 const Py_UNICODE *in, const int insize,
7366 const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007367{
Victor Stinner3a50e702011-10-18 21:21:00 +02007368 const DWORD flags = encode_code_page_flags(code_page, errors);
7369 const Py_UNICODE *startin = in;
7370 const Py_UNICODE *endin = in + insize;
7371 /* Ideally, we should get reason from FormatMessage. This is the Windows
7372 2000 English version of the message. */
7373 const char *reason = "invalid character";
7374 /* 4=maximum length of a UTF-8 sequence */
7375 char buffer[4];
7376 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7377 Py_ssize_t outsize;
7378 char *out;
7379 int charsize;
7380 PyObject *errorHandler = NULL;
7381 PyObject *exc = NULL;
7382 PyObject *encoding_obj = NULL;
7383 char *encoding;
7384 int err;
7385 Py_ssize_t startpos, newpos, newoutsize;
7386 PyObject *rep;
7387 int ret = -1;
7388
7389 assert(insize > 0);
7390
7391 encoding = code_page_name(code_page, &encoding_obj);
7392 if (encoding == NULL)
7393 return -1;
7394
7395 if (errors == NULL || strcmp(errors, "strict") == 0) {
7396 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7397 then we raise a UnicodeEncodeError. */
7398 make_encode_exception(&exc, encoding, in, insize, 0, 0, reason);
7399 if (exc != NULL) {
7400 PyCodec_StrictErrors(exc);
7401 Py_DECREF(exc);
7402 }
7403 Py_XDECREF(encoding_obj);
7404 return -1;
7405 }
7406
7407 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7408 pusedDefaultChar = &usedDefaultChar;
7409 else
7410 pusedDefaultChar = NULL;
7411
7412 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7413 PyErr_NoMemory();
7414 goto error;
7415 }
7416 outsize = insize * Py_ARRAY_LENGTH(buffer);
7417
7418 if (*outbytes == NULL) {
7419 /* Create string object */
7420 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7421 if (*outbytes == NULL)
7422 goto error;
7423 out = PyBytes_AS_STRING(*outbytes);
7424 }
7425 else {
7426 /* Extend string object */
7427 Py_ssize_t n = PyBytes_Size(*outbytes);
7428 if (n > PY_SSIZE_T_MAX - outsize) {
7429 PyErr_NoMemory();
7430 goto error;
7431 }
7432 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7433 goto error;
7434 out = PyBytes_AS_STRING(*outbytes) + n;
7435 }
7436
7437 /* Encode the string character per character */
7438 while (in < endin)
7439 {
7440 if ((in + 2) <= endin
7441 && 0xD800 <= in[0] && in[0] <= 0xDBFF
7442 && 0xDC00 <= in[1] && in[1] <= 0xDFFF)
7443 charsize = 2;
7444 else
7445 charsize = 1;
7446
7447 outsize = WideCharToMultiByte(code_page, flags,
7448 in, charsize,
7449 buffer, Py_ARRAY_LENGTH(buffer),
7450 NULL, pusedDefaultChar);
7451 if (outsize > 0) {
7452 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7453 {
7454 in += charsize;
7455 memcpy(out, buffer, outsize);
7456 out += outsize;
7457 continue;
7458 }
7459 }
7460 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7461 PyErr_SetFromWindowsErr(0);
7462 goto error;
7463 }
7464
7465 charsize = Py_MAX(charsize - 1, 1);
7466 startpos = in - startin;
7467 rep = unicode_encode_call_errorhandler(
7468 errors, &errorHandler, encoding, reason,
7469 startin, insize, &exc,
7470 startpos, startpos + charsize, &newpos);
7471 if (rep == NULL)
7472 goto error;
7473 in = startin + newpos;
7474
7475 if (PyBytes_Check(rep)) {
7476 outsize = PyBytes_GET_SIZE(rep);
7477 if (outsize != 1) {
7478 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7479 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7480 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7481 Py_DECREF(rep);
7482 goto error;
7483 }
7484 out = PyBytes_AS_STRING(*outbytes) + offset;
7485 }
7486 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7487 out += outsize;
7488 }
7489 else {
7490 Py_ssize_t i;
7491 enum PyUnicode_Kind kind;
7492 void *data;
7493
7494 if (PyUnicode_READY(rep) < 0) {
7495 Py_DECREF(rep);
7496 goto error;
7497 }
7498
7499 outsize = PyUnicode_GET_LENGTH(rep);
7500 if (outsize != 1) {
7501 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7502 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7503 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7504 Py_DECREF(rep);
7505 goto error;
7506 }
7507 out = PyBytes_AS_STRING(*outbytes) + offset;
7508 }
7509 kind = PyUnicode_KIND(rep);
7510 data = PyUnicode_DATA(rep);
7511 for (i=0; i < outsize; i++) {
7512 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7513 if (ch > 127) {
7514 raise_encode_exception(&exc,
7515 encoding,
7516 startin, insize,
7517 startpos, startpos + charsize,
7518 "unable to encode error handler result to ASCII");
7519 Py_DECREF(rep);
7520 goto error;
7521 }
7522 *out = (unsigned char)ch;
7523 out++;
7524 }
7525 }
7526 Py_DECREF(rep);
7527 }
7528 /* write a NUL byte */
7529 *out = 0;
7530 outsize = out - PyBytes_AS_STRING(*outbytes);
7531 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7532 if (_PyBytes_Resize(outbytes, outsize) < 0)
7533 goto error;
7534 ret = 0;
7535
7536error:
7537 Py_XDECREF(encoding_obj);
7538 Py_XDECREF(errorHandler);
7539 Py_XDECREF(exc);
7540 return ret;
7541}
7542
7543/*
7544 * Encode a Unicode string to a Windows code page into a byte string.
7545 *
7546 * Returns consumed characters if succeed, or raise a WindowsError and returns
7547 * -1 on other error.
7548 */
7549static int
7550encode_code_page_chunk(UINT code_page, PyObject **outbytes,
7551 const Py_UNICODE *p, int size,
7552 const char* errors)
7553{
7554 int done;
7555
7556 if (size == 0) {
7557 if (*outbytes == NULL) {
7558 *outbytes = PyBytes_FromStringAndSize(NULL, 0);
7559 if (*outbytes == NULL)
7560 return -1;
7561 }
7562 return 0;
7563 }
7564
7565 done = encode_code_page_strict(code_page, outbytes, p, size, errors);
7566 if (done == -2)
7567 done = encode_code_page_errors(code_page, outbytes, p, size, errors);
7568 return done;
7569}
7570
7571static PyObject *
7572encode_code_page(int code_page,
7573 const Py_UNICODE *p, Py_ssize_t size,
7574 const char *errors)
7575{
7576 PyObject *outbytes = NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007577 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00007578
Victor Stinner3a50e702011-10-18 21:21:00 +02007579 if (code_page < 0) {
7580 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7581 return NULL;
7582 }
7583
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007584#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00007585 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007586 if (size > INT_MAX)
Victor Stinner3a50e702011-10-18 21:21:00 +02007587 ret = encode_code_page_chunk(code_page, &outbytes, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007588 else
7589#endif
Victor Stinner3a50e702011-10-18 21:21:00 +02007590 ret = encode_code_page_chunk(code_page, &outbytes, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007591
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007592 if (ret < 0) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007593 Py_XDECREF(outbytes);
Benjamin Peterson29060642009-01-31 22:14:21 +00007594 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007595 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007596
7597#ifdef NEED_RETRY
7598 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007599 p += INT_MAX;
7600 size -= INT_MAX;
7601 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007602 }
7603#endif
7604
Victor Stinner3a50e702011-10-18 21:21:00 +02007605 return outbytes;
7606}
7607
7608PyObject *
7609PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7610 Py_ssize_t size,
7611 const char *errors)
7612{
7613 return encode_code_page(CP_ACP, p, size, errors);
7614}
7615
7616PyObject *
7617PyUnicode_EncodeCodePage(int code_page,
7618 PyObject *unicode,
7619 const char *errors)
7620{
7621 const Py_UNICODE *p;
7622 Py_ssize_t size;
7623 p = PyUnicode_AsUnicodeAndSize(unicode, &size);
7624 if (p == NULL)
7625 return NULL;
7626 return encode_code_page(code_page, p, size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007627}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007628
Alexander Belopolsky40018472011-02-26 01:02:56 +00007629PyObject *
7630PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007631{
7632 if (!PyUnicode_Check(unicode)) {
7633 PyErr_BadArgument();
7634 return NULL;
7635 }
7636 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007637 PyUnicode_GET_SIZE(unicode),
7638 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007639}
7640
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007641#undef NEED_RETRY
7642
Victor Stinner99b95382011-07-04 14:23:54 +02007643#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007644
Guido van Rossumd57fd912000-03-10 22:53:23 +00007645/* --- Character Mapping Codec -------------------------------------------- */
7646
Alexander Belopolsky40018472011-02-26 01:02:56 +00007647PyObject *
7648PyUnicode_DecodeCharmap(const char *s,
7649 Py_ssize_t size,
7650 PyObject *mapping,
7651 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007652{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007653 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007654 Py_ssize_t startinpos;
7655 Py_ssize_t endinpos;
7656 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007657 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007658 PyUnicodeObject *v;
7659 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007660 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007661 PyObject *errorHandler = NULL;
7662 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007663 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007664 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00007665
Guido van Rossumd57fd912000-03-10 22:53:23 +00007666 /* Default to Latin-1 */
7667 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007668 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007669
7670 v = _PyUnicode_New(size);
7671 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007672 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007673 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007674 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007675 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007676 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007677 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007678 mapstring = PyUnicode_AS_UNICODE(mapping);
7679 maplen = PyUnicode_GET_SIZE(mapping);
7680 while (s < e) {
7681 unsigned char ch = *s;
7682 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007683
Benjamin Peterson29060642009-01-31 22:14:21 +00007684 if (ch < maplen)
7685 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00007686
Benjamin Peterson29060642009-01-31 22:14:21 +00007687 if (x == 0xfffe) {
7688 /* undefined mapping */
7689 outpos = p-PyUnicode_AS_UNICODE(v);
7690 startinpos = s-starts;
7691 endinpos = startinpos+1;
7692 if (unicode_decode_call_errorhandler(
7693 errors, &errorHandler,
7694 "charmap", "character maps to <undefined>",
7695 &starts, &e, &startinpos, &endinpos, &exc, &s,
7696 &v, &outpos, &p)) {
7697 goto onError;
7698 }
7699 continue;
7700 }
7701 *p++ = x;
7702 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007703 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007704 }
7705 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007706 while (s < e) {
7707 unsigned char ch = *s;
7708 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007709
Benjamin Peterson29060642009-01-31 22:14:21 +00007710 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7711 w = PyLong_FromLong((long)ch);
7712 if (w == NULL)
7713 goto onError;
7714 x = PyObject_GetItem(mapping, w);
7715 Py_DECREF(w);
7716 if (x == NULL) {
7717 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7718 /* No mapping found means: mapping is undefined. */
7719 PyErr_Clear();
7720 x = Py_None;
7721 Py_INCREF(x);
7722 } else
7723 goto onError;
7724 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007725
Benjamin Peterson29060642009-01-31 22:14:21 +00007726 /* Apply mapping */
7727 if (PyLong_Check(x)) {
7728 long value = PyLong_AS_LONG(x);
7729 if (value < 0 || value > 65535) {
7730 PyErr_SetString(PyExc_TypeError,
7731 "character mapping must be in range(65536)");
7732 Py_DECREF(x);
7733 goto onError;
7734 }
7735 *p++ = (Py_UNICODE)value;
7736 }
7737 else if (x == Py_None) {
7738 /* undefined mapping */
7739 outpos = p-PyUnicode_AS_UNICODE(v);
7740 startinpos = s-starts;
7741 endinpos = startinpos+1;
7742 if (unicode_decode_call_errorhandler(
7743 errors, &errorHandler,
7744 "charmap", "character maps to <undefined>",
7745 &starts, &e, &startinpos, &endinpos, &exc, &s,
7746 &v, &outpos, &p)) {
7747 Py_DECREF(x);
7748 goto onError;
7749 }
7750 Py_DECREF(x);
7751 continue;
7752 }
7753 else if (PyUnicode_Check(x)) {
7754 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007755
Benjamin Peterson29060642009-01-31 22:14:21 +00007756 if (targetsize == 1)
7757 /* 1-1 mapping */
7758 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007759
Benjamin Peterson29060642009-01-31 22:14:21 +00007760 else if (targetsize > 1) {
7761 /* 1-n mapping */
7762 if (targetsize > extrachars) {
7763 /* resize first */
7764 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
7765 Py_ssize_t needed = (targetsize - extrachars) + \
7766 (targetsize << 2);
7767 extrachars += needed;
7768 /* XXX overflow detection missing */
Victor Stinnerfe226c02011-10-03 03:52:20 +02007769 if (PyUnicode_Resize((PyObject**)&v,
Benjamin Peterson29060642009-01-31 22:14:21 +00007770 PyUnicode_GET_SIZE(v) + needed) < 0) {
7771 Py_DECREF(x);
7772 goto onError;
7773 }
7774 p = PyUnicode_AS_UNICODE(v) + oldpos;
7775 }
7776 Py_UNICODE_COPY(p,
7777 PyUnicode_AS_UNICODE(x),
7778 targetsize);
7779 p += targetsize;
7780 extrachars -= targetsize;
7781 }
7782 /* 1-0 mapping: skip the character */
7783 }
7784 else {
7785 /* wrong return value */
7786 PyErr_SetString(PyExc_TypeError,
7787 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007788 Py_DECREF(x);
7789 goto onError;
7790 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007791 Py_DECREF(x);
7792 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007793 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007794 }
7795 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02007796 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007797 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007798 Py_XDECREF(errorHandler);
7799 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02007800#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007801 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007802 Py_DECREF(v);
7803 return NULL;
7804 }
Victor Stinner17efeed2011-10-04 20:05:46 +02007805#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02007806 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007807 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00007808
Benjamin Peterson29060642009-01-31 22:14:21 +00007809 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007810 Py_XDECREF(errorHandler);
7811 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007812 Py_XDECREF(v);
7813 return NULL;
7814}
7815
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007816/* Charmap encoding: the lookup table */
7817
Alexander Belopolsky40018472011-02-26 01:02:56 +00007818struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007819 PyObject_HEAD
7820 unsigned char level1[32];
7821 int count2, count3;
7822 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007823};
7824
7825static PyObject*
7826encoding_map_size(PyObject *obj, PyObject* args)
7827{
7828 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007829 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007830 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007831}
7832
7833static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007834 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007835 PyDoc_STR("Return the size (in bytes) of this object") },
7836 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007837};
7838
7839static void
7840encoding_map_dealloc(PyObject* o)
7841{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007842 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007843}
7844
7845static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007846 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007847 "EncodingMap", /*tp_name*/
7848 sizeof(struct encoding_map), /*tp_basicsize*/
7849 0, /*tp_itemsize*/
7850 /* methods */
7851 encoding_map_dealloc, /*tp_dealloc*/
7852 0, /*tp_print*/
7853 0, /*tp_getattr*/
7854 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007855 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007856 0, /*tp_repr*/
7857 0, /*tp_as_number*/
7858 0, /*tp_as_sequence*/
7859 0, /*tp_as_mapping*/
7860 0, /*tp_hash*/
7861 0, /*tp_call*/
7862 0, /*tp_str*/
7863 0, /*tp_getattro*/
7864 0, /*tp_setattro*/
7865 0, /*tp_as_buffer*/
7866 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7867 0, /*tp_doc*/
7868 0, /*tp_traverse*/
7869 0, /*tp_clear*/
7870 0, /*tp_richcompare*/
7871 0, /*tp_weaklistoffset*/
7872 0, /*tp_iter*/
7873 0, /*tp_iternext*/
7874 encoding_map_methods, /*tp_methods*/
7875 0, /*tp_members*/
7876 0, /*tp_getset*/
7877 0, /*tp_base*/
7878 0, /*tp_dict*/
7879 0, /*tp_descr_get*/
7880 0, /*tp_descr_set*/
7881 0, /*tp_dictoffset*/
7882 0, /*tp_init*/
7883 0, /*tp_alloc*/
7884 0, /*tp_new*/
7885 0, /*tp_free*/
7886 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007887};
7888
7889PyObject*
7890PyUnicode_BuildEncodingMap(PyObject* string)
7891{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007892 PyObject *result;
7893 struct encoding_map *mresult;
7894 int i;
7895 int need_dict = 0;
7896 unsigned char level1[32];
7897 unsigned char level2[512];
7898 unsigned char *mlevel1, *mlevel2, *mlevel3;
7899 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007900 int kind;
7901 void *data;
7902 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007903
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007904 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007905 PyErr_BadArgument();
7906 return NULL;
7907 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007908 kind = PyUnicode_KIND(string);
7909 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007910 memset(level1, 0xFF, sizeof level1);
7911 memset(level2, 0xFF, sizeof level2);
7912
7913 /* If there isn't a one-to-one mapping of NULL to \0,
7914 or if there are non-BMP characters, we need to use
7915 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007916 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007917 need_dict = 1;
7918 for (i = 1; i < 256; i++) {
7919 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007920 ch = PyUnicode_READ(kind, data, i);
7921 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007922 need_dict = 1;
7923 break;
7924 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007925 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007926 /* unmapped character */
7927 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007928 l1 = ch >> 11;
7929 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007930 if (level1[l1] == 0xFF)
7931 level1[l1] = count2++;
7932 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007933 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007934 }
7935
7936 if (count2 >= 0xFF || count3 >= 0xFF)
7937 need_dict = 1;
7938
7939 if (need_dict) {
7940 PyObject *result = PyDict_New();
7941 PyObject *key, *value;
7942 if (!result)
7943 return NULL;
7944 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007945 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007946 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007947 if (!key || !value)
7948 goto failed1;
7949 if (PyDict_SetItem(result, key, value) == -1)
7950 goto failed1;
7951 Py_DECREF(key);
7952 Py_DECREF(value);
7953 }
7954 return result;
7955 failed1:
7956 Py_XDECREF(key);
7957 Py_XDECREF(value);
7958 Py_DECREF(result);
7959 return NULL;
7960 }
7961
7962 /* Create a three-level trie */
7963 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7964 16*count2 + 128*count3 - 1);
7965 if (!result)
7966 return PyErr_NoMemory();
7967 PyObject_Init(result, &EncodingMapType);
7968 mresult = (struct encoding_map*)result;
7969 mresult->count2 = count2;
7970 mresult->count3 = count3;
7971 mlevel1 = mresult->level1;
7972 mlevel2 = mresult->level23;
7973 mlevel3 = mresult->level23 + 16*count2;
7974 memcpy(mlevel1, level1, 32);
7975 memset(mlevel2, 0xFF, 16*count2);
7976 memset(mlevel3, 0, 128*count3);
7977 count3 = 0;
7978 for (i = 1; i < 256; i++) {
7979 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007980 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007981 /* unmapped character */
7982 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007983 o1 = PyUnicode_READ(kind, data, i)>>11;
7984 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007985 i2 = 16*mlevel1[o1] + o2;
7986 if (mlevel2[i2] == 0xFF)
7987 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007988 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007989 i3 = 128*mlevel2[i2] + o3;
7990 mlevel3[i3] = i;
7991 }
7992 return result;
7993}
7994
7995static int
7996encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
7997{
7998 struct encoding_map *map = (struct encoding_map*)mapping;
7999 int l1 = c>>11;
8000 int l2 = (c>>7) & 0xF;
8001 int l3 = c & 0x7F;
8002 int i;
8003
8004#ifdef Py_UNICODE_WIDE
8005 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008006 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008007 }
8008#endif
8009 if (c == 0)
8010 return 0;
8011 /* level 1*/
8012 i = map->level1[l1];
8013 if (i == 0xFF) {
8014 return -1;
8015 }
8016 /* level 2*/
8017 i = map->level23[16*i+l2];
8018 if (i == 0xFF) {
8019 return -1;
8020 }
8021 /* level 3 */
8022 i = map->level23[16*map->count2 + 128*i + l3];
8023 if (i == 0) {
8024 return -1;
8025 }
8026 return i;
8027}
8028
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008029/* Lookup the character ch in the mapping. If the character
8030 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008031 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008032static PyObject *
8033charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008034{
Christian Heimes217cfd12007-12-02 14:31:20 +00008035 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008036 PyObject *x;
8037
8038 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008039 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008040 x = PyObject_GetItem(mapping, w);
8041 Py_DECREF(w);
8042 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008043 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8044 /* No mapping found means: mapping is undefined. */
8045 PyErr_Clear();
8046 x = Py_None;
8047 Py_INCREF(x);
8048 return x;
8049 } else
8050 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008051 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008052 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008053 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008054 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008055 long value = PyLong_AS_LONG(x);
8056 if (value < 0 || value > 255) {
8057 PyErr_SetString(PyExc_TypeError,
8058 "character mapping must be in range(256)");
8059 Py_DECREF(x);
8060 return NULL;
8061 }
8062 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008063 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008064 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008065 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008066 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008067 /* wrong return value */
8068 PyErr_Format(PyExc_TypeError,
8069 "character mapping must return integer, bytes or None, not %.400s",
8070 x->ob_type->tp_name);
8071 Py_DECREF(x);
8072 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008073 }
8074}
8075
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008076static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008077charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008078{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008079 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8080 /* exponentially overallocate to minimize reallocations */
8081 if (requiredsize < 2*outsize)
8082 requiredsize = 2*outsize;
8083 if (_PyBytes_Resize(outobj, requiredsize))
8084 return -1;
8085 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008086}
8087
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the character was encoded and written */
    enc_FAILED,         /* the character has no mapping; nothing written */
    enc_EXCEPTION       /* an error occurred (lookup or resize failed) */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008091/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008092 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008093 space is available. Return a new reference to the object that
8094 was put in the output buffer, or Py_None, if the mapping was undefined
8095 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008096 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008097static charmapencode_result
8098charmapencode_output(Py_UNICODE c, PyObject *mapping,
8099 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008100{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008101 PyObject *rep;
8102 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008103 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008104
Christian Heimes90aa7642007-12-19 02:45:37 +00008105 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008106 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008107 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008108 if (res == -1)
8109 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008110 if (outsize<requiredsize)
8111 if (charmapencode_resize(outobj, outpos, requiredsize))
8112 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008113 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008114 outstart[(*outpos)++] = (char)res;
8115 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008116 }
8117
8118 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008119 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008120 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008121 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008122 Py_DECREF(rep);
8123 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008124 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008125 if (PyLong_Check(rep)) {
8126 Py_ssize_t requiredsize = *outpos+1;
8127 if (outsize<requiredsize)
8128 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8129 Py_DECREF(rep);
8130 return enc_EXCEPTION;
8131 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008132 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008133 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008134 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008135 else {
8136 const char *repchars = PyBytes_AS_STRING(rep);
8137 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8138 Py_ssize_t requiredsize = *outpos+repsize;
8139 if (outsize<requiredsize)
8140 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8141 Py_DECREF(rep);
8142 return enc_EXCEPTION;
8143 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008144 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008145 memcpy(outstart + *outpos, repchars, repsize);
8146 *outpos += repsize;
8147 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008148 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008149 Py_DECREF(rep);
8150 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008151}
8152
8153/* handle an error in PyUnicode_EncodeCharmap
8154 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008155static int
8156charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00008157 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008158 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008159 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008160 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008161{
8162 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008163 Py_ssize_t repsize;
8164 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008165 Py_UNICODE *uni2;
8166 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008167 Py_ssize_t collstartpos = *inpos;
8168 Py_ssize_t collendpos = *inpos+1;
8169 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008170 char *encoding = "charmap";
8171 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008172 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008173
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008174 /* find all unencodable characters */
8175 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008176 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008177 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008178 int res = encoding_map_lookup(p[collendpos], mapping);
8179 if (res != -1)
8180 break;
8181 ++collendpos;
8182 continue;
8183 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008184
Benjamin Peterson29060642009-01-31 22:14:21 +00008185 rep = charmapencode_lookup(p[collendpos], mapping);
8186 if (rep==NULL)
8187 return -1;
8188 else if (rep!=Py_None) {
8189 Py_DECREF(rep);
8190 break;
8191 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008192 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008193 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008194 }
8195 /* cache callback name lookup
8196 * (if not done yet, i.e. it's the first error) */
8197 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008198 if ((errors==NULL) || (!strcmp(errors, "strict")))
8199 *known_errorHandler = 1;
8200 else if (!strcmp(errors, "replace"))
8201 *known_errorHandler = 2;
8202 else if (!strcmp(errors, "ignore"))
8203 *known_errorHandler = 3;
8204 else if (!strcmp(errors, "xmlcharrefreplace"))
8205 *known_errorHandler = 4;
8206 else
8207 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008208 }
8209 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008210 case 1: /* strict */
8211 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
8212 return -1;
8213 case 2: /* replace */
8214 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008215 x = charmapencode_output('?', mapping, res, respos);
8216 if (x==enc_EXCEPTION) {
8217 return -1;
8218 }
8219 else if (x==enc_FAILED) {
8220 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
8221 return -1;
8222 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008223 }
8224 /* fall through */
8225 case 3: /* ignore */
8226 *inpos = collendpos;
8227 break;
8228 case 4: /* xmlcharrefreplace */
8229 /* generate replacement (temporarily (mis)uses p) */
8230 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008231 char buffer[2+29+1+1];
8232 char *cp;
8233 sprintf(buffer, "&#%d;", (int)p[collpos]);
8234 for (cp = buffer; *cp; ++cp) {
8235 x = charmapencode_output(*cp, mapping, res, respos);
8236 if (x==enc_EXCEPTION)
8237 return -1;
8238 else if (x==enc_FAILED) {
8239 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
8240 return -1;
8241 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008242 }
8243 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008244 *inpos = collendpos;
8245 break;
8246 default:
8247 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00008248 encoding, reason, p, size, exceptionObject,
8249 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008250 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008251 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008252 if (PyBytes_Check(repunicode)) {
8253 /* Directly copy bytes result to output. */
8254 Py_ssize_t outsize = PyBytes_Size(*res);
8255 Py_ssize_t requiredsize;
8256 repsize = PyBytes_Size(repunicode);
8257 requiredsize = *respos + repsize;
8258 if (requiredsize > outsize)
8259 /* Make room for all additional bytes. */
8260 if (charmapencode_resize(res, respos, requiredsize)) {
8261 Py_DECREF(repunicode);
8262 return -1;
8263 }
8264 memcpy(PyBytes_AsString(*res) + *respos,
8265 PyBytes_AsString(repunicode), repsize);
8266 *respos += repsize;
8267 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008268 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008269 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008270 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008271 /* generate replacement */
8272 repsize = PyUnicode_GET_SIZE(repunicode);
8273 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008274 x = charmapencode_output(*uni2, mapping, res, respos);
8275 if (x==enc_EXCEPTION) {
8276 return -1;
8277 }
8278 else if (x==enc_FAILED) {
8279 Py_DECREF(repunicode);
8280 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
8281 return -1;
8282 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008283 }
8284 *inpos = newpos;
8285 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008286 }
8287 return 0;
8288}
8289
Alexander Belopolsky40018472011-02-26 01:02:56 +00008290PyObject *
8291PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8292 Py_ssize_t size,
8293 PyObject *mapping,
8294 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008295{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008296 /* output object */
8297 PyObject *res = NULL;
8298 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008299 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008300 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008301 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008302 PyObject *errorHandler = NULL;
8303 PyObject *exc = NULL;
8304 /* the following variable is used for caching string comparisons
8305 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8306 * 3=ignore, 4=xmlcharrefreplace */
8307 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008308
8309 /* Default to Latin-1 */
8310 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008311 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008312
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008313 /* allocate enough for a simple encoding without
8314 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008315 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008316 if (res == NULL)
8317 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008318 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008319 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008320
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008321 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008322 /* try to encode it */
8323 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
8324 if (x==enc_EXCEPTION) /* error */
8325 goto onError;
8326 if (x==enc_FAILED) { /* unencodable character */
8327 if (charmap_encoding_error(p, size, &inpos, mapping,
8328 &exc,
8329 &known_errorHandler, &errorHandler, errors,
8330 &res, &respos)) {
8331 goto onError;
8332 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008333 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008334 else
8335 /* done with this character => adjust input position */
8336 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008337 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008338
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008339 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008340 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008341 if (_PyBytes_Resize(&res, respos) < 0)
8342 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008343
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008344 Py_XDECREF(exc);
8345 Py_XDECREF(errorHandler);
8346 return res;
8347
Benjamin Peterson29060642009-01-31 22:14:21 +00008348 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008349 Py_XDECREF(res);
8350 Py_XDECREF(exc);
8351 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008352 return NULL;
8353}
8354
Alexander Belopolsky40018472011-02-26 01:02:56 +00008355PyObject *
8356PyUnicode_AsCharmapString(PyObject *unicode,
8357 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008358{
8359 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008360 PyErr_BadArgument();
8361 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008362 }
8363 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00008364 PyUnicode_GET_SIZE(unicode),
8365 mapping,
8366 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008367}
8368
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008369/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008370static void
8371make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008372 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008373 Py_ssize_t startpos, Py_ssize_t endpos,
8374 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008375{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008376 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008377 *exceptionObject = _PyUnicodeTranslateError_Create(
8378 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008379 }
8380 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008381 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8382 goto onError;
8383 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8384 goto onError;
8385 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8386 goto onError;
8387 return;
8388 onError:
8389 Py_DECREF(*exceptionObject);
8390 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008391 }
8392}
8393
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008394/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008395static void
8396raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008397 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008398 Py_ssize_t startpos, Py_ssize_t endpos,
8399 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008400{
8401 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008402 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008403 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008404 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008405}
8406
8407/* error handling callback helper:
8408 build arguments, call the callback and check the arguments,
8409 put the result into newpos and return the replacement string, which
8410 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008411static PyObject *
8412unicode_translate_call_errorhandler(const char *errors,
8413 PyObject **errorHandler,
8414 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008415 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008416 Py_ssize_t startpos, Py_ssize_t endpos,
8417 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008418{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008419 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008420
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008421 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008422 PyObject *restuple;
8423 PyObject *resunicode;
8424
8425 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008426 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008427 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008428 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008429 }
8430
8431 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008432 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008433 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008434 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008435
8436 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008437 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008438 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008439 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008440 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008441 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008442 Py_DECREF(restuple);
8443 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008444 }
8445 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008446 &resunicode, &i_newpos)) {
8447 Py_DECREF(restuple);
8448 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008449 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008450 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008451 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008452 else
8453 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008454 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008455 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8456 Py_DECREF(restuple);
8457 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008458 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008459 Py_INCREF(resunicode);
8460 Py_DECREF(restuple);
8461 return resunicode;
8462}
8463
8464/* Lookup the character ch in the mapping and put the result in result,
8465 which must be decrefed by the caller.
8466 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008467static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008468charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008469{
Christian Heimes217cfd12007-12-02 14:31:20 +00008470 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008471 PyObject *x;
8472
8473 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008474 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008475 x = PyObject_GetItem(mapping, w);
8476 Py_DECREF(w);
8477 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008478 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8479 /* No mapping found means: use 1:1 mapping. */
8480 PyErr_Clear();
8481 *result = NULL;
8482 return 0;
8483 } else
8484 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008485 }
8486 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008487 *result = x;
8488 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008489 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008490 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008491 long value = PyLong_AS_LONG(x);
8492 long max = PyUnicode_GetMax();
8493 if (value < 0 || value > max) {
8494 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008495 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008496 Py_DECREF(x);
8497 return -1;
8498 }
8499 *result = x;
8500 return 0;
8501 }
8502 else if (PyUnicode_Check(x)) {
8503 *result = x;
8504 return 0;
8505 }
8506 else {
8507 /* wrong return value */
8508 PyErr_SetString(PyExc_TypeError,
8509 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008510 Py_DECREF(x);
8511 return -1;
8512 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008513}
8514/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008515 if not reallocate and adjust various state variables.
8516 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008517static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008518charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008519 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008520{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008521 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00008522 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008523 /* exponentially overallocate to minimize reallocations */
8524 if (requiredsize < 2 * oldsize)
8525 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008526 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8527 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008528 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008529 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008530 }
8531 return 0;
8532}
8533/* lookup the character, put the result in the output string and adjust
8534 various state variables. Return a new reference to the object that
8535 was put in the output buffer in *result, or Py_None, if the mapping was
8536 undefined (in which case no character was written).
8537 The called must decref result.
8538 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008539static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008540charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8541 PyObject *mapping, Py_UCS4 **output,
8542 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008543 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008544{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008545 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8546 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008547 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008548 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008549 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008550 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008551 }
8552 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008553 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008554 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008555 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008556 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008557 }
8558 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008559 Py_ssize_t repsize;
8560 if (PyUnicode_READY(*res) == -1)
8561 return -1;
8562 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008563 if (repsize==1) {
8564 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008565 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008566 }
8567 else if (repsize!=0) {
8568 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008569 Py_ssize_t requiredsize = *opos +
8570 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008571 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008572 Py_ssize_t i;
8573 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008574 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008575 for(i = 0; i < repsize; i++)
8576 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008577 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008578 }
8579 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008580 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008581 return 0;
8582}
8583
Alexander Belopolsky40018472011-02-26 01:02:56 +00008584PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008585_PyUnicode_TranslateCharmap(PyObject *input,
8586 PyObject *mapping,
8587 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008588{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008589 /* input object */
8590 char *idata;
8591 Py_ssize_t size, i;
8592 int kind;
8593 /* output buffer */
8594 Py_UCS4 *output = NULL;
8595 Py_ssize_t osize;
8596 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008597 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008598 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008599 char *reason = "character maps to <undefined>";
8600 PyObject *errorHandler = NULL;
8601 PyObject *exc = NULL;
8602 /* the following variable is used for caching string comparisons
8603 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8604 * 3=ignore, 4=xmlcharrefreplace */
8605 int known_errorHandler = -1;
8606
Guido van Rossumd57fd912000-03-10 22:53:23 +00008607 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008608 PyErr_BadArgument();
8609 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008610 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008611
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008612 if (PyUnicode_READY(input) == -1)
8613 return NULL;
8614 idata = (char*)PyUnicode_DATA(input);
8615 kind = PyUnicode_KIND(input);
8616 size = PyUnicode_GET_LENGTH(input);
8617 i = 0;
8618
8619 if (size == 0) {
8620 Py_INCREF(input);
8621 return input;
8622 }
8623
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008624 /* allocate enough for a simple 1:1 translation without
8625 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008626 osize = size;
8627 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8628 opos = 0;
8629 if (output == NULL) {
8630 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008631 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008632 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008633
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008634 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008635 /* try to encode it */
8636 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008637 if (charmaptranslate_output(input, i, mapping,
8638 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008639 Py_XDECREF(x);
8640 goto onError;
8641 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008642 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008643 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008644 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008645 else { /* untranslatable character */
8646 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8647 Py_ssize_t repsize;
8648 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008649 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008650 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008651 Py_ssize_t collstart = i;
8652 Py_ssize_t collend = i+1;
8653 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008654
Benjamin Peterson29060642009-01-31 22:14:21 +00008655 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008656 while (collend < size) {
8657 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008658 goto onError;
8659 Py_XDECREF(x);
8660 if (x!=Py_None)
8661 break;
8662 ++collend;
8663 }
8664 /* cache callback name lookup
8665 * (if not done yet, i.e. it's the first error) */
8666 if (known_errorHandler==-1) {
8667 if ((errors==NULL) || (!strcmp(errors, "strict")))
8668 known_errorHandler = 1;
8669 else if (!strcmp(errors, "replace"))
8670 known_errorHandler = 2;
8671 else if (!strcmp(errors, "ignore"))
8672 known_errorHandler = 3;
8673 else if (!strcmp(errors, "xmlcharrefreplace"))
8674 known_errorHandler = 4;
8675 else
8676 known_errorHandler = 0;
8677 }
8678 switch (known_errorHandler) {
8679 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008680 raise_translate_exception(&exc, input, collstart,
8681 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008682 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008683 case 2: /* replace */
8684 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008685 for (coll = collstart; coll<collend; coll++)
8686 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008687 /* fall through */
8688 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008689 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008690 break;
8691 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008692 /* generate replacement (temporarily (mis)uses i) */
8693 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008694 char buffer[2+29+1+1];
8695 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008696 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8697 if (charmaptranslate_makespace(&output, &osize,
8698 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008699 goto onError;
8700 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008701 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008702 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008703 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008704 break;
8705 default:
8706 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008707 reason, input, &exc,
8708 collstart, collend, &newpos);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02008709 if (repunicode == NULL || _PyUnicode_READY_REPLACE(&repunicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00008710 goto onError;
8711 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008712 repsize = PyUnicode_GET_LENGTH(repunicode);
8713 if (charmaptranslate_makespace(&output, &osize,
8714 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008715 Py_DECREF(repunicode);
8716 goto onError;
8717 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008718 for (uni2 = 0; repsize-->0; ++uni2)
8719 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8720 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008721 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008722 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008723 }
8724 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008725 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8726 if (!res)
8727 goto onError;
8728 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008729 Py_XDECREF(exc);
8730 Py_XDECREF(errorHandler);
8731 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008732
Benjamin Peterson29060642009-01-31 22:14:21 +00008733 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008734 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008735 Py_XDECREF(exc);
8736 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008737 return NULL;
8738}
8739
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008740/* Deprecated. Use PyUnicode_Translate instead. */
8741PyObject *
8742PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8743 Py_ssize_t size,
8744 PyObject *mapping,
8745 const char *errors)
8746{
8747 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8748 if (!unicode)
8749 return NULL;
8750 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8751}
8752
Alexander Belopolsky40018472011-02-26 01:02:56 +00008753PyObject *
8754PyUnicode_Translate(PyObject *str,
8755 PyObject *mapping,
8756 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008757{
8758 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008759
Guido van Rossumd57fd912000-03-10 22:53:23 +00008760 str = PyUnicode_FromObject(str);
8761 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008762 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008763 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008764 Py_DECREF(str);
8765 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008766
Benjamin Peterson29060642009-01-31 22:14:21 +00008767 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008768 Py_XDECREF(str);
8769 return NULL;
8770}
Tim Petersced69f82003-09-16 20:30:58 +00008771
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008772static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008773fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008774{
8775 /* No need to call PyUnicode_READY(self) because this function is only
8776 called as a callback from fixup() which does it already. */
8777 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8778 const int kind = PyUnicode_KIND(self);
8779 void *data = PyUnicode_DATA(self);
8780 Py_UCS4 maxchar = 0, ch, fixed;
8781 Py_ssize_t i;
8782
8783 for (i = 0; i < len; ++i) {
8784 ch = PyUnicode_READ(kind, data, i);
8785 fixed = 0;
8786 if (ch > 127) {
8787 if (Py_UNICODE_ISSPACE(ch))
8788 fixed = ' ';
8789 else {
8790 const int decimal = Py_UNICODE_TODECIMAL(ch);
8791 if (decimal >= 0)
8792 fixed = '0' + decimal;
8793 }
8794 if (fixed != 0) {
8795 if (fixed > maxchar)
8796 maxchar = fixed;
8797 PyUnicode_WRITE(kind, data, i, fixed);
8798 }
8799 else if (ch > maxchar)
8800 maxchar = ch;
8801 }
8802 else if (ch > maxchar)
8803 maxchar = ch;
8804 }
8805
8806 return maxchar;
8807}
8808
8809PyObject *
8810_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8811{
8812 if (!PyUnicode_Check(unicode)) {
8813 PyErr_BadInternalCall();
8814 return NULL;
8815 }
8816 if (PyUnicode_READY(unicode) == -1)
8817 return NULL;
8818 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8819 /* If the string is already ASCII, just return the same string */
8820 Py_INCREF(unicode);
8821 return unicode;
8822 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008823 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008824}
8825
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008826PyObject *
8827PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8828 Py_ssize_t length)
8829{
8830 PyObject *result;
8831 Py_UNICODE *p; /* write pointer into result */
8832 Py_ssize_t i;
8833 /* Copy to a new string */
8834 result = (PyObject *)_PyUnicode_New(length);
8835 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
8836 if (result == NULL)
8837 return result;
8838 p = PyUnicode_AS_UNICODE(result);
8839 /* Iterate over code points */
8840 for (i = 0; i < length; i++) {
8841 Py_UNICODE ch =s[i];
8842 if (ch > 127) {
8843 int decimal = Py_UNICODE_TODECIMAL(ch);
8844 if (decimal >= 0)
8845 p[i] = '0' + decimal;
8846 }
8847 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008848#ifndef DONT_MAKE_RESULT_READY
8849 if (_PyUnicode_READY_REPLACE(&result)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008850 Py_DECREF(result);
8851 return NULL;
8852 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008853#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02008854 assert(_PyUnicode_CheckConsistency(result, 1));
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008855 return result;
8856}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008857/* --- Decimal Encoder ---------------------------------------------------- */
8858
Alexander Belopolsky40018472011-02-26 01:02:56 +00008859int
8860PyUnicode_EncodeDecimal(Py_UNICODE *s,
8861 Py_ssize_t length,
8862 char *output,
8863 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008864{
8865 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008866 PyObject *errorHandler = NULL;
8867 PyObject *exc = NULL;
8868 const char *encoding = "decimal";
8869 const char *reason = "invalid decimal Unicode string";
8870 /* the following variable is used for caching string comparisons
8871 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
8872 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008873
8874 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008875 PyErr_BadArgument();
8876 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008877 }
8878
8879 p = s;
8880 end = s + length;
8881 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008882 register Py_UNICODE ch = *p;
8883 int decimal;
8884 PyObject *repunicode;
8885 Py_ssize_t repsize;
8886 Py_ssize_t newpos;
8887 Py_UNICODE *uni2;
8888 Py_UNICODE *collstart;
8889 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00008890
Benjamin Peterson29060642009-01-31 22:14:21 +00008891 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008892 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00008893 ++p;
8894 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008895 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008896 decimal = Py_UNICODE_TODECIMAL(ch);
8897 if (decimal >= 0) {
8898 *output++ = '0' + decimal;
8899 ++p;
8900 continue;
8901 }
8902 if (0 < ch && ch < 256) {
8903 *output++ = (char)ch;
8904 ++p;
8905 continue;
8906 }
8907 /* All other characters are considered unencodable */
8908 collstart = p;
8909 collend = p+1;
8910 while (collend < end) {
8911 if ((0 < *collend && *collend < 256) ||
8912 !Py_UNICODE_ISSPACE(*collend) ||
8913 Py_UNICODE_TODECIMAL(*collend))
8914 break;
8915 }
8916 /* cache callback name lookup
8917 * (if not done yet, i.e. it's the first error) */
8918 if (known_errorHandler==-1) {
8919 if ((errors==NULL) || (!strcmp(errors, "strict")))
8920 known_errorHandler = 1;
8921 else if (!strcmp(errors, "replace"))
8922 known_errorHandler = 2;
8923 else if (!strcmp(errors, "ignore"))
8924 known_errorHandler = 3;
8925 else if (!strcmp(errors, "xmlcharrefreplace"))
8926 known_errorHandler = 4;
8927 else
8928 known_errorHandler = 0;
8929 }
8930 switch (known_errorHandler) {
8931 case 1: /* strict */
8932 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
8933 goto onError;
8934 case 2: /* replace */
8935 for (p = collstart; p < collend; ++p)
8936 *output++ = '?';
8937 /* fall through */
8938 case 3: /* ignore */
8939 p = collend;
8940 break;
8941 case 4: /* xmlcharrefreplace */
8942 /* generate replacement (temporarily (mis)uses p) */
8943 for (p = collstart; p < collend; ++p)
8944 output += sprintf(output, "&#%d;", (int)*p);
8945 p = collend;
8946 break;
8947 default:
8948 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
8949 encoding, reason, s, length, &exc,
8950 collstart-s, collend-s, &newpos);
8951 if (repunicode == NULL)
8952 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008953 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00008954 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008955 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8956 Py_DECREF(repunicode);
8957 goto onError;
8958 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008959 /* generate replacement */
8960 repsize = PyUnicode_GET_SIZE(repunicode);
8961 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8962 Py_UNICODE ch = *uni2;
8963 if (Py_UNICODE_ISSPACE(ch))
8964 *output++ = ' ';
8965 else {
8966 decimal = Py_UNICODE_TODECIMAL(ch);
8967 if (decimal >= 0)
8968 *output++ = '0' + decimal;
8969 else if (0 < ch && ch < 256)
8970 *output++ = (char)ch;
8971 else {
8972 Py_DECREF(repunicode);
8973 raise_encode_exception(&exc, encoding,
8974 s, length, collstart-s, collend-s, reason);
8975 goto onError;
8976 }
8977 }
8978 }
8979 p = s + newpos;
8980 Py_DECREF(repunicode);
8981 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00008982 }
8983 /* 0-terminate the output string */
8984 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008985 Py_XDECREF(exc);
8986 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008987 return 0;
8988
Benjamin Peterson29060642009-01-31 22:14:21 +00008989 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008990 Py_XDECREF(exc);
8991 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008992 return -1;
8993}
8994
Guido van Rossumd57fd912000-03-10 22:53:23 +00008995/* --- Helpers ------------------------------------------------------------ */
8996
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008997static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008998any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008999 Py_ssize_t start,
9000 Py_ssize_t end)
9001{
9002 int kind1, kind2, kind;
9003 void *buf1, *buf2;
9004 Py_ssize_t len1, len2, result;
9005
9006 kind1 = PyUnicode_KIND(s1);
9007 kind2 = PyUnicode_KIND(s2);
9008 kind = kind1 > kind2 ? kind1 : kind2;
9009 buf1 = PyUnicode_DATA(s1);
9010 buf2 = PyUnicode_DATA(s2);
9011 if (kind1 != kind)
9012 buf1 = _PyUnicode_AsKind(s1, kind);
9013 if (!buf1)
9014 return -2;
9015 if (kind2 != kind)
9016 buf2 = _PyUnicode_AsKind(s2, kind);
9017 if (!buf2) {
9018 if (kind1 != kind) PyMem_Free(buf1);
9019 return -2;
9020 }
9021 len1 = PyUnicode_GET_LENGTH(s1);
9022 len2 = PyUnicode_GET_LENGTH(s2);
9023
Victor Stinner794d5672011-10-10 03:21:36 +02009024 if (direction > 0) {
9025 switch(kind) {
9026 case PyUnicode_1BYTE_KIND:
9027 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9028 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9029 else
9030 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9031 break;
9032 case PyUnicode_2BYTE_KIND:
9033 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9034 break;
9035 case PyUnicode_4BYTE_KIND:
9036 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9037 break;
9038 default:
9039 assert(0); result = -2;
9040 }
9041 }
9042 else {
9043 switch(kind) {
9044 case PyUnicode_1BYTE_KIND:
9045 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9046 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9047 else
9048 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9049 break;
9050 case PyUnicode_2BYTE_KIND:
9051 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9052 break;
9053 case PyUnicode_4BYTE_KIND:
9054 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9055 break;
9056 default:
9057 assert(0); result = -2;
9058 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009059 }
9060
9061 if (kind1 != kind)
9062 PyMem_Free(buf1);
9063 if (kind2 != kind)
9064 PyMem_Free(buf2);
9065
9066 return result;
9067}
9068
9069Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009070_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009071 Py_ssize_t n_buffer,
9072 void *digits, Py_ssize_t n_digits,
9073 Py_ssize_t min_width,
9074 const char *grouping,
9075 const char *thousands_sep)
9076{
9077 switch(kind) {
9078 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009079 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
9080 return _PyUnicode_ascii_InsertThousandsGrouping(
9081 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9082 min_width, grouping, thousands_sep);
9083 else
9084 return _PyUnicode_ucs1_InsertThousandsGrouping(
9085 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9086 min_width, grouping, thousands_sep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009087 case PyUnicode_2BYTE_KIND:
9088 return _PyUnicode_ucs2_InsertThousandsGrouping(
9089 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
9090 min_width, grouping, thousands_sep);
9091 case PyUnicode_4BYTE_KIND:
9092 return _PyUnicode_ucs4_InsertThousandsGrouping(
9093 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
9094 min_width, grouping, thousands_sep);
9095 }
9096 assert(0);
9097 return -1;
9098}
9099
9100
/* Helper macro to fix up start/end slice values for a sequence of length
   `len`, in place:
     - `end` greater than `len` is clamped to `len`;
     - a negative `end` or `start` is taken relative to the end of the
       sequence (`len` is added) and clamped to 0 if still negative.
   `start` and `end` must be modifiable lvalues; every argument is
   evaluated more than once, so none may have side effects.
   The body is wrapped in do { } while (0) so that the macro behaves as a
   single statement (safe under an un-braced if/else); invoke it with a
   trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009115
Alexander Belopolsky40018472011-02-26 01:02:56 +00009116Py_ssize_t
9117PyUnicode_Count(PyObject *str,
9118 PyObject *substr,
9119 Py_ssize_t start,
9120 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009121{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009122 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009123 PyUnicodeObject* str_obj;
9124 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009125 int kind1, kind2, kind;
9126 void *buf1 = NULL, *buf2 = NULL;
9127 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009128
Thomas Wouters477c8d52006-05-27 19:21:47 +00009129 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009130 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009131 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009132 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02009133 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009134 Py_DECREF(str_obj);
9135 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009136 }
Tim Petersced69f82003-09-16 20:30:58 +00009137
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009138 kind1 = PyUnicode_KIND(str_obj);
9139 kind2 = PyUnicode_KIND(sub_obj);
9140 kind = kind1 > kind2 ? kind1 : kind2;
9141 buf1 = PyUnicode_DATA(str_obj);
9142 if (kind1 != kind)
9143 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
9144 if (!buf1)
9145 goto onError;
9146 buf2 = PyUnicode_DATA(sub_obj);
9147 if (kind2 != kind)
9148 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
9149 if (!buf2)
9150 goto onError;
9151 len1 = PyUnicode_GET_LENGTH(str_obj);
9152 len2 = PyUnicode_GET_LENGTH(sub_obj);
9153
9154 ADJUST_INDICES(start, end, len1);
9155 switch(kind) {
9156 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009157 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9158 result = asciilib_count(
9159 ((Py_UCS1*)buf1) + start, end - start,
9160 buf2, len2, PY_SSIZE_T_MAX
9161 );
9162 else
9163 result = ucs1lib_count(
9164 ((Py_UCS1*)buf1) + start, end - start,
9165 buf2, len2, PY_SSIZE_T_MAX
9166 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009167 break;
9168 case PyUnicode_2BYTE_KIND:
9169 result = ucs2lib_count(
9170 ((Py_UCS2*)buf1) + start, end - start,
9171 buf2, len2, PY_SSIZE_T_MAX
9172 );
9173 break;
9174 case PyUnicode_4BYTE_KIND:
9175 result = ucs4lib_count(
9176 ((Py_UCS4*)buf1) + start, end - start,
9177 buf2, len2, PY_SSIZE_T_MAX
9178 );
9179 break;
9180 default:
9181 assert(0); result = 0;
9182 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009183
9184 Py_DECREF(sub_obj);
9185 Py_DECREF(str_obj);
9186
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009187 if (kind1 != kind)
9188 PyMem_Free(buf1);
9189 if (kind2 != kind)
9190 PyMem_Free(buf2);
9191
Guido van Rossumd57fd912000-03-10 22:53:23 +00009192 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009193 onError:
9194 Py_DECREF(sub_obj);
9195 Py_DECREF(str_obj);
9196 if (kind1 != kind && buf1)
9197 PyMem_Free(buf1);
9198 if (kind2 != kind && buf2)
9199 PyMem_Free(buf2);
9200 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009201}
9202
Alexander Belopolsky40018472011-02-26 01:02:56 +00009203Py_ssize_t
9204PyUnicode_Find(PyObject *str,
9205 PyObject *sub,
9206 Py_ssize_t start,
9207 Py_ssize_t end,
9208 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009209{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009210 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009211
Guido van Rossumd57fd912000-03-10 22:53:23 +00009212 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009213 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009214 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009215 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009216 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009217 Py_DECREF(str);
9218 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009219 }
Tim Petersced69f82003-09-16 20:30:58 +00009220
Victor Stinner794d5672011-10-10 03:21:36 +02009221 result = any_find_slice(direction,
9222 str, sub, start, end
9223 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009224
Guido van Rossumd57fd912000-03-10 22:53:23 +00009225 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009226 Py_DECREF(sub);
9227
Guido van Rossumd57fd912000-03-10 22:53:23 +00009228 return result;
9229}
9230
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009231Py_ssize_t
9232PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9233 Py_ssize_t start, Py_ssize_t end,
9234 int direction)
9235{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009236 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009237 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009238 if (PyUnicode_READY(str) == -1)
9239 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009240 if (start < 0 || end < 0) {
9241 PyErr_SetString(PyExc_IndexError, "string index out of range");
9242 return -2;
9243 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009244 if (end > PyUnicode_GET_LENGTH(str))
9245 end = PyUnicode_GET_LENGTH(str);
9246 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009247 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9248 kind, end-start, ch, direction);
9249 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009250 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009251 else
9252 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009253}
9254
Alexander Belopolsky40018472011-02-26 01:02:56 +00009255static int
9256tailmatch(PyUnicodeObject *self,
9257 PyUnicodeObject *substring,
9258 Py_ssize_t start,
9259 Py_ssize_t end,
9260 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009261{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009262 int kind_self;
9263 int kind_sub;
9264 void *data_self;
9265 void *data_sub;
9266 Py_ssize_t offset;
9267 Py_ssize_t i;
9268 Py_ssize_t end_sub;
9269
9270 if (PyUnicode_READY(self) == -1 ||
9271 PyUnicode_READY(substring) == -1)
9272 return 0;
9273
9274 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009275 return 1;
9276
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009277 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9278 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009279 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009280 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009281
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009282 kind_self = PyUnicode_KIND(self);
9283 data_self = PyUnicode_DATA(self);
9284 kind_sub = PyUnicode_KIND(substring);
9285 data_sub = PyUnicode_DATA(substring);
9286 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9287
9288 if (direction > 0)
9289 offset = end;
9290 else
9291 offset = start;
9292
9293 if (PyUnicode_READ(kind_self, data_self, offset) ==
9294 PyUnicode_READ(kind_sub, data_sub, 0) &&
9295 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9296 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9297 /* If both are of the same kind, memcmp is sufficient */
9298 if (kind_self == kind_sub) {
9299 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009300 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009301 data_sub,
9302 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009303 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009304 }
9305 /* otherwise we have to compare each character by first accesing it */
9306 else {
9307 /* We do not need to compare 0 and len(substring)-1 because
9308 the if statement above ensured already that they are equal
9309 when we end up here. */
9310 // TODO: honor direction and do a forward or backwards search
9311 for (i = 1; i < end_sub; ++i) {
9312 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9313 PyUnicode_READ(kind_sub, data_sub, i))
9314 return 0;
9315 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009316 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009317 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009318 }
9319
9320 return 0;
9321}
9322
Alexander Belopolsky40018472011-02-26 01:02:56 +00009323Py_ssize_t
9324PyUnicode_Tailmatch(PyObject *str,
9325 PyObject *substr,
9326 Py_ssize_t start,
9327 Py_ssize_t end,
9328 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009329{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009330 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009331
Guido van Rossumd57fd912000-03-10 22:53:23 +00009332 str = PyUnicode_FromObject(str);
9333 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009334 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009335 substr = PyUnicode_FromObject(substr);
9336 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009337 Py_DECREF(str);
9338 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009339 }
Tim Petersced69f82003-09-16 20:30:58 +00009340
Guido van Rossumd57fd912000-03-10 22:53:23 +00009341 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00009342 (PyUnicodeObject *)substr,
9343 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009344 Py_DECREF(str);
9345 Py_DECREF(substr);
9346 return result;
9347}
9348
Guido van Rossumd57fd912000-03-10 22:53:23 +00009349/* Apply fixfct filter to the Unicode object self and return a
9350 reference to the modified object */
9351
Alexander Belopolsky40018472011-02-26 01:02:56 +00009352static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009353fixup(PyObject *self,
9354 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009355{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009356 PyObject *u;
9357 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009358
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009359 if (PyUnicode_READY(self) == -1)
9360 return NULL;
9361 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
9362 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
9363 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009364 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009365 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009366
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009367 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009368 PyUnicode_GET_LENGTH(u) * PyUnicode_KIND(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009369
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009370 /* fix functions return the new maximum character in a string,
9371 if the kind of the resulting unicode object does not change,
9372 everything is fine. Otherwise we need to change the string kind
9373 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009374 maxchar_new = fixfct(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009375 if (maxchar_new == 0)
9376 /* do nothing, keep maxchar_new at 0 which means no changes. */;
9377 else if (maxchar_new <= 127)
9378 maxchar_new = 127;
9379 else if (maxchar_new <= 255)
9380 maxchar_new = 255;
9381 else if (maxchar_new <= 65535)
9382 maxchar_new = 65535;
9383 else
9384 maxchar_new = 1114111; /* 0x10ffff */
9385
9386 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009387 /* fixfct should return TRUE if it modified the buffer. If
9388 FALSE, return a reference to the original buffer instead
9389 (to save space, not time) */
9390 Py_INCREF(self);
9391 Py_DECREF(u);
9392 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009393 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009394 else if (maxchar_new == maxchar_old) {
9395 return u;
9396 }
9397 else {
9398 /* In case the maximum character changed, we need to
9399 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009400 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009401 if (v == NULL) {
9402 Py_DECREF(u);
9403 return NULL;
9404 }
9405 if (maxchar_new > maxchar_old) {
9406 /* If the maxchar increased so that the kind changed, not all
9407 characters are representable anymore and we need to fix the
9408 string again. This only happens in very few cases. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009409 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner9310abb2011-10-05 00:59:23 +02009410 maxchar_old = fixfct(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009411 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
9412 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009413 else {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009414 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009415 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009416
9417 Py_DECREF(u);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009418 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009419 return v;
9420 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009421}
9422
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009423static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009424fixupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009425{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009426 /* No need to call PyUnicode_READY(self) because this function is only
9427 called as a callback from fixup() which does it already. */
9428 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9429 const int kind = PyUnicode_KIND(self);
9430 void *data = PyUnicode_DATA(self);
9431 int touched = 0;
9432 Py_UCS4 maxchar = 0;
9433 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009434
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009435 for (i = 0; i < len; ++i) {
9436 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9437 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
9438 if (up != ch) {
9439 if (up > maxchar)
9440 maxchar = up;
9441 PyUnicode_WRITE(kind, data, i, up);
9442 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009443 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009444 else if (ch > maxchar)
9445 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009446 }
9447
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009448 if (touched)
9449 return maxchar;
9450 else
9451 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009452}
9453
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009454static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009455fixlower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009456{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009457 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9458 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9459 const int kind = PyUnicode_KIND(self);
9460 void *data = PyUnicode_DATA(self);
9461 int touched = 0;
9462 Py_UCS4 maxchar = 0;
9463 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009464
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009465 for(i = 0; i < len; ++i) {
9466 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9467 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9468 if (lo != ch) {
9469 if (lo > maxchar)
9470 maxchar = lo;
9471 PyUnicode_WRITE(kind, data, i, lo);
9472 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009473 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009474 else if (ch > maxchar)
9475 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009476 }
9477
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009478 if (touched)
9479 return maxchar;
9480 else
9481 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009482}
9483
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009484static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009485fixswapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009486{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009487 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9488 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9489 const int kind = PyUnicode_KIND(self);
9490 void *data = PyUnicode_DATA(self);
9491 int touched = 0;
9492 Py_UCS4 maxchar = 0;
9493 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009494
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009495 for(i = 0; i < len; ++i) {
9496 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9497 Py_UCS4 nu = 0;
9498
9499 if (Py_UNICODE_ISUPPER(ch))
9500 nu = Py_UNICODE_TOLOWER(ch);
9501 else if (Py_UNICODE_ISLOWER(ch))
9502 nu = Py_UNICODE_TOUPPER(ch);
9503
9504 if (nu != 0) {
9505 if (nu > maxchar)
9506 maxchar = nu;
9507 PyUnicode_WRITE(kind, data, i, nu);
9508 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009509 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009510 else if (ch > maxchar)
9511 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009512 }
9513
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009514 if (touched)
9515 return maxchar;
9516 else
9517 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009518}
9519
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009520static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009521fixcapitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009522{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009523 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9524 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9525 const int kind = PyUnicode_KIND(self);
9526 void *data = PyUnicode_DATA(self);
9527 int touched = 0;
9528 Py_UCS4 maxchar = 0;
9529 Py_ssize_t i = 0;
9530 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00009531
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009532 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009533 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009534
9535 ch = PyUnicode_READ(kind, data, i);
9536 if (!Py_UNICODE_ISUPPER(ch)) {
9537 maxchar = Py_UNICODE_TOUPPER(ch);
9538 PyUnicode_WRITE(kind, data, i, maxchar);
9539 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009540 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009541 ++i;
9542 for(; i < len; ++i) {
9543 ch = PyUnicode_READ(kind, data, i);
9544 if (!Py_UNICODE_ISLOWER(ch)) {
9545 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9546 if (lo > maxchar)
9547 maxchar = lo;
9548 PyUnicode_WRITE(kind, data, i, lo);
9549 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009550 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009551 else if (ch > maxchar)
9552 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009553 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009554
9555 if (touched)
9556 return maxchar;
9557 else
9558 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009559}
9560
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009561static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009562fixtitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009563{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009564 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9565 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9566 const int kind = PyUnicode_KIND(self);
9567 void *data = PyUnicode_DATA(self);
9568 Py_UCS4 maxchar = 0;
9569 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009570 int previous_is_cased;
9571
9572 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009573 if (len == 1) {
9574 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9575 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
9576 if (ti != ch) {
9577 PyUnicode_WRITE(kind, data, i, ti);
9578 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00009579 }
9580 else
9581 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009582 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009583 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009584 for(; i < len; ++i) {
9585 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9586 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00009587
Benjamin Peterson29060642009-01-31 22:14:21 +00009588 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009589 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00009590 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009591 nu = Py_UNICODE_TOTITLE(ch);
9592
9593 if (nu > maxchar)
9594 maxchar = nu;
9595 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00009596
Benjamin Peterson29060642009-01-31 22:14:21 +00009597 if (Py_UNICODE_ISLOWER(ch) ||
9598 Py_UNICODE_ISUPPER(ch) ||
9599 Py_UNICODE_ISTITLE(ch))
9600 previous_is_cased = 1;
9601 else
9602 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009603 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009604 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009605}
9606
Tim Peters8ce9f162004-08-27 01:49:32 +00009607PyObject *
9608PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009609{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009610 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009611 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009612 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009613 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009614 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9615 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009616 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009617 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009618 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009619 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009620 int use_memcpy;
9621 unsigned char *res_data = NULL, *sep_data = NULL;
9622 PyObject *last_obj;
9623 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009624
Tim Peters05eba1f2004-08-27 21:32:02 +00009625 fseq = PySequence_Fast(seq, "");
9626 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009627 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009628 }
9629
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009630 /* NOTE: the following code can't call back into Python code,
9631 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009632 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009633
Tim Peters05eba1f2004-08-27 21:32:02 +00009634 seqlen = PySequence_Fast_GET_SIZE(fseq);
9635 /* If empty sequence, return u"". */
9636 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009637 Py_DECREF(fseq);
9638 Py_INCREF(unicode_empty);
9639 res = unicode_empty;
9640 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009641 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009642
Tim Peters05eba1f2004-08-27 21:32:02 +00009643 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009644 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009645 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009646 if (seqlen == 1) {
9647 if (PyUnicode_CheckExact(items[0])) {
9648 res = items[0];
9649 Py_INCREF(res);
9650 Py_DECREF(fseq);
9651 return res;
9652 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009653 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009654 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009655 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009656 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009657 /* Set up sep and seplen */
9658 if (separator == NULL) {
9659 /* fall back to a blank space separator */
9660 sep = PyUnicode_FromOrdinal(' ');
9661 if (!sep)
9662 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009663 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009664 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009665 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009666 else {
9667 if (!PyUnicode_Check(separator)) {
9668 PyErr_Format(PyExc_TypeError,
9669 "separator: expected str instance,"
9670 " %.80s found",
9671 Py_TYPE(separator)->tp_name);
9672 goto onError;
9673 }
9674 if (PyUnicode_READY(separator))
9675 goto onError;
9676 sep = separator;
9677 seplen = PyUnicode_GET_LENGTH(separator);
9678 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9679 /* inc refcount to keep this code path symmetric with the
9680 above case of a blank separator */
9681 Py_INCREF(sep);
9682 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009683 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009684 }
9685
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009686 /* There are at least two things to join, or else we have a subclass
9687 * of str in the sequence.
9688 * Do a pre-pass to figure out the total amount of space we'll
9689 * need (sz), and see whether all argument are strings.
9690 */
9691 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009692#ifdef Py_DEBUG
9693 use_memcpy = 0;
9694#else
9695 use_memcpy = 1;
9696#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009697 for (i = 0; i < seqlen; i++) {
9698 const Py_ssize_t old_sz = sz;
9699 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009700 if (!PyUnicode_Check(item)) {
9701 PyErr_Format(PyExc_TypeError,
9702 "sequence item %zd: expected str instance,"
9703 " %.80s found",
9704 i, Py_TYPE(item)->tp_name);
9705 goto onError;
9706 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009707 if (PyUnicode_READY(item) == -1)
9708 goto onError;
9709 sz += PyUnicode_GET_LENGTH(item);
9710 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009711 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009712 if (i != 0)
9713 sz += seplen;
9714 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9715 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009716 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009717 goto onError;
9718 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009719 if (use_memcpy && last_obj != NULL) {
9720 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9721 use_memcpy = 0;
9722 }
9723 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009724 }
Tim Petersced69f82003-09-16 20:30:58 +00009725
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009726 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009727 if (res == NULL)
9728 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009729
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009730 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009731#ifdef Py_DEBUG
9732 use_memcpy = 0;
9733#else
9734 if (use_memcpy) {
9735 res_data = PyUnicode_1BYTE_DATA(res);
9736 kind = PyUnicode_KIND(res);
9737 if (seplen != 0)
9738 sep_data = PyUnicode_1BYTE_DATA(sep);
9739 }
9740#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009741 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009742 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009743 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009744 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009745 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009746 if (use_memcpy) {
9747 Py_MEMCPY(res_data,
9748 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009749 kind * seplen);
9750 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009751 }
9752 else {
9753 copy_characters(res, res_offset, sep, 0, seplen);
9754 res_offset += seplen;
9755 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009756 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009757 itemlen = PyUnicode_GET_LENGTH(item);
9758 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009759 if (use_memcpy) {
9760 Py_MEMCPY(res_data,
9761 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009762 kind * itemlen);
9763 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009764 }
9765 else {
9766 copy_characters(res, res_offset, item, 0, itemlen);
9767 res_offset += itemlen;
9768 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009769 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009770 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009771 if (use_memcpy)
9772 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009773 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009774 else
9775 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009776
Tim Peters05eba1f2004-08-27 21:32:02 +00009777 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009778 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009779 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009780 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009781
Benjamin Peterson29060642009-01-31 22:14:21 +00009782 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009783 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009784 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009785 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009786 return NULL;
9787}
9788
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009789#define FILL(kind, data, value, start, length) \
9790 do { \
9791 Py_ssize_t i_ = 0; \
9792 assert(kind != PyUnicode_WCHAR_KIND); \
9793 switch ((kind)) { \
9794 case PyUnicode_1BYTE_KIND: { \
9795 unsigned char * to_ = (unsigned char *)((data)) + (start); \
9796 memset(to_, (unsigned char)value, length); \
9797 break; \
9798 } \
9799 case PyUnicode_2BYTE_KIND: { \
9800 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
9801 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9802 break; \
9803 } \
9804 default: { \
9805 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
9806 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9807 break; \
9808 } \
9809 } \
9810 } while (0)
9811
Victor Stinner9310abb2011-10-05 00:59:23 +02009812static PyObject *
9813pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009814 Py_ssize_t left,
9815 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009816 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009817{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009818 PyObject *u;
9819 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009820 int kind;
9821 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009822
9823 if (left < 0)
9824 left = 0;
9825 if (right < 0)
9826 right = 0;
9827
Tim Peters7a29bd52001-09-12 03:03:31 +00009828 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009829 Py_INCREF(self);
9830 return self;
9831 }
9832
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009833 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9834 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009835 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9836 return NULL;
9837 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009838 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9839 if (fill > maxchar)
9840 maxchar = fill;
9841 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009842 if (!u)
9843 return NULL;
9844
9845 kind = PyUnicode_KIND(u);
9846 data = PyUnicode_DATA(u);
9847 if (left)
9848 FILL(kind, data, fill, 0, left);
9849 if (right)
9850 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009851 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009852 assert(_PyUnicode_CheckConsistency(u, 1));
9853 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009854}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009855#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009856
Alexander Belopolsky40018472011-02-26 01:02:56 +00009857PyObject *
9858PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009859{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009860 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009861
9862 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009863 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009864 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009865
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009866 switch(PyUnicode_KIND(string)) {
9867 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009868 if (PyUnicode_IS_ASCII(string))
9869 list = asciilib_splitlines(
9870 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9871 PyUnicode_GET_LENGTH(string), keepends);
9872 else
9873 list = ucs1lib_splitlines(
9874 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9875 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009876 break;
9877 case PyUnicode_2BYTE_KIND:
9878 list = ucs2lib_splitlines(
9879 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
9880 PyUnicode_GET_LENGTH(string), keepends);
9881 break;
9882 case PyUnicode_4BYTE_KIND:
9883 list = ucs4lib_splitlines(
9884 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
9885 PyUnicode_GET_LENGTH(string), keepends);
9886 break;
9887 default:
9888 assert(0);
9889 list = 0;
9890 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009891 Py_DECREF(string);
9892 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009893}
9894
Alexander Belopolsky40018472011-02-26 01:02:56 +00009895static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009896split(PyObject *self,
9897 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009898 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009899{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009900 int kind1, kind2, kind;
9901 void *buf1, *buf2;
9902 Py_ssize_t len1, len2;
9903 PyObject* out;
9904
Guido van Rossumd57fd912000-03-10 22:53:23 +00009905 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009906 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009907
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009908 if (PyUnicode_READY(self) == -1)
9909 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009910
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009911 if (substring == NULL)
9912 switch(PyUnicode_KIND(self)) {
9913 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009914 if (PyUnicode_IS_ASCII(self))
9915 return asciilib_split_whitespace(
9916 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9917 PyUnicode_GET_LENGTH(self), maxcount
9918 );
9919 else
9920 return ucs1lib_split_whitespace(
9921 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9922 PyUnicode_GET_LENGTH(self), maxcount
9923 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009924 case PyUnicode_2BYTE_KIND:
9925 return ucs2lib_split_whitespace(
9926 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9927 PyUnicode_GET_LENGTH(self), maxcount
9928 );
9929 case PyUnicode_4BYTE_KIND:
9930 return ucs4lib_split_whitespace(
9931 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9932 PyUnicode_GET_LENGTH(self), maxcount
9933 );
9934 default:
9935 assert(0);
9936 return NULL;
9937 }
9938
9939 if (PyUnicode_READY(substring) == -1)
9940 return NULL;
9941
9942 kind1 = PyUnicode_KIND(self);
9943 kind2 = PyUnicode_KIND(substring);
9944 kind = kind1 > kind2 ? kind1 : kind2;
9945 buf1 = PyUnicode_DATA(self);
9946 buf2 = PyUnicode_DATA(substring);
9947 if (kind1 != kind)
9948 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9949 if (!buf1)
9950 return NULL;
9951 if (kind2 != kind)
9952 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9953 if (!buf2) {
9954 if (kind1 != kind) PyMem_Free(buf1);
9955 return NULL;
9956 }
9957 len1 = PyUnicode_GET_LENGTH(self);
9958 len2 = PyUnicode_GET_LENGTH(substring);
9959
9960 switch(kind) {
9961 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009962 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9963 out = asciilib_split(
9964 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9965 else
9966 out = ucs1lib_split(
9967 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009968 break;
9969 case PyUnicode_2BYTE_KIND:
9970 out = ucs2lib_split(
9971 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9972 break;
9973 case PyUnicode_4BYTE_KIND:
9974 out = ucs4lib_split(
9975 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9976 break;
9977 default:
9978 out = NULL;
9979 }
9980 if (kind1 != kind)
9981 PyMem_Free(buf1);
9982 if (kind2 != kind)
9983 PyMem_Free(buf2);
9984 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009985}
9986
Alexander Belopolsky40018472011-02-26 01:02:56 +00009987static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009988rsplit(PyObject *self,
9989 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009990 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009991{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009992 int kind1, kind2, kind;
9993 void *buf1, *buf2;
9994 Py_ssize_t len1, len2;
9995 PyObject* out;
9996
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009997 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009998 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009999
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010000 if (PyUnicode_READY(self) == -1)
10001 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010002
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010003 if (substring == NULL)
10004 switch(PyUnicode_KIND(self)) {
10005 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010006 if (PyUnicode_IS_ASCII(self))
10007 return asciilib_rsplit_whitespace(
10008 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
10009 PyUnicode_GET_LENGTH(self), maxcount
10010 );
10011 else
10012 return ucs1lib_rsplit_whitespace(
10013 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
10014 PyUnicode_GET_LENGTH(self), maxcount
10015 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010016 case PyUnicode_2BYTE_KIND:
10017 return ucs2lib_rsplit_whitespace(
10018 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
10019 PyUnicode_GET_LENGTH(self), maxcount
10020 );
10021 case PyUnicode_4BYTE_KIND:
10022 return ucs4lib_rsplit_whitespace(
10023 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
10024 PyUnicode_GET_LENGTH(self), maxcount
10025 );
10026 default:
10027 assert(0);
10028 return NULL;
10029 }
10030
10031 if (PyUnicode_READY(substring) == -1)
10032 return NULL;
10033
10034 kind1 = PyUnicode_KIND(self);
10035 kind2 = PyUnicode_KIND(substring);
10036 kind = kind1 > kind2 ? kind1 : kind2;
10037 buf1 = PyUnicode_DATA(self);
10038 buf2 = PyUnicode_DATA(substring);
10039 if (kind1 != kind)
10040 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
10041 if (!buf1)
10042 return NULL;
10043 if (kind2 != kind)
10044 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
10045 if (!buf2) {
10046 if (kind1 != kind) PyMem_Free(buf1);
10047 return NULL;
10048 }
10049 len1 = PyUnicode_GET_LENGTH(self);
10050 len2 = PyUnicode_GET_LENGTH(substring);
10051
10052 switch(kind) {
10053 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010054 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10055 out = asciilib_rsplit(
10056 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
10057 else
10058 out = ucs1lib_rsplit(
10059 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010060 break;
10061 case PyUnicode_2BYTE_KIND:
10062 out = ucs2lib_rsplit(
10063 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
10064 break;
10065 case PyUnicode_4BYTE_KIND:
10066 out = ucs4lib_rsplit(
10067 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
10068 break;
10069 default:
10070 out = NULL;
10071 }
10072 if (kind1 != kind)
10073 PyMem_Free(buf1);
10074 if (kind2 != kind)
10075 PyMem_Free(buf2);
10076 return out;
10077}
10078
10079static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010080anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10081 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010082{
10083 switch(kind) {
10084 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010085 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10086 return asciilib_find(buf1, len1, buf2, len2, offset);
10087 else
10088 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010089 case PyUnicode_2BYTE_KIND:
10090 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10091 case PyUnicode_4BYTE_KIND:
10092 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10093 }
10094 assert(0);
10095 return -1;
10096}
10097
10098static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010099anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10100 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010101{
10102 switch(kind) {
10103 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010104 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10105 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10106 else
10107 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010108 case PyUnicode_2BYTE_KIND:
10109 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10110 case PyUnicode_4BYTE_KIND:
10111 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10112 }
10113 assert(0);
10114 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010115}
10116
Alexander Belopolsky40018472011-02-26 01:02:56 +000010117static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010118replace(PyObject *self, PyObject *str1,
10119 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010120{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010121 PyObject *u;
10122 char *sbuf = PyUnicode_DATA(self);
10123 char *buf1 = PyUnicode_DATA(str1);
10124 char *buf2 = PyUnicode_DATA(str2);
10125 int srelease = 0, release1 = 0, release2 = 0;
10126 int skind = PyUnicode_KIND(self);
10127 int kind1 = PyUnicode_KIND(str1);
10128 int kind2 = PyUnicode_KIND(str2);
10129 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10130 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10131 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010132 int mayshrink;
10133 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010134
10135 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010136 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010137 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010138 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010139
Victor Stinner59de0ee2011-10-07 10:01:28 +020010140 if (str1 == str2)
10141 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010142 if (skind < kind1)
10143 /* substring too wide to be present */
10144 goto nothing;
10145
Victor Stinner49a0a212011-10-12 23:46:10 +020010146 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10147 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10148 /* Replacing str1 with str2 may cause a maxchar reduction in the
10149 result string. */
10150 mayshrink = (maxchar_str2 < maxchar);
10151 maxchar = Py_MAX(maxchar, maxchar_str2);
10152
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010153 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +000010154 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010155 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010156 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010157 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010158 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010159 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010160 Py_UCS4 u1, u2;
10161 int rkind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010162 u1 = PyUnicode_READ_CHAR(str1, 0);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +020010163 if (findchar(sbuf, PyUnicode_KIND(self),
10164 slen, u1, 1) < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010165 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010166 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010167 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010168 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010169 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010170 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010171 rkind = PyUnicode_KIND(u);
10172 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
10173 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010174 if (--maxcount < 0)
10175 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010176 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010177 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010178 }
10179 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010180 int rkind = skind;
10181 char *res;
Victor Stinner25a4b292011-10-06 12:31:55 +020010182
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010183 if (kind1 < rkind) {
10184 /* widen substring */
10185 buf1 = _PyUnicode_AsKind(str1, rkind);
10186 if (!buf1) goto error;
10187 release1 = 1;
10188 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010189 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010190 if (i < 0)
10191 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010192 if (rkind > kind2) {
10193 /* widen replacement */
10194 buf2 = _PyUnicode_AsKind(str2, rkind);
10195 if (!buf2) goto error;
10196 release2 = 1;
10197 }
10198 else if (rkind < kind2) {
10199 /* widen self and buf1 */
10200 rkind = kind2;
10201 if (release1) PyMem_Free(buf1);
10202 sbuf = _PyUnicode_AsKind(self, rkind);
10203 if (!sbuf) goto error;
10204 srelease = 1;
10205 buf1 = _PyUnicode_AsKind(str1, rkind);
10206 if (!buf1) goto error;
10207 release1 = 1;
10208 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010209 u = PyUnicode_New(slen, maxchar);
10210 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010211 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010212 assert(PyUnicode_KIND(u) == rkind);
10213 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010214
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010215 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010216 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010217 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010218 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010219 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010220 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010221
10222 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010223 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010224 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010225 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010226 if (i == -1)
10227 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010228 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010229 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010230 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010231 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010232 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010233 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010234 }
10235 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010236 Py_ssize_t n, i, j, ires;
10237 Py_ssize_t product, new_size;
10238 int rkind = skind;
10239 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010240
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010241 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010242 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010243 buf1 = _PyUnicode_AsKind(str1, rkind);
10244 if (!buf1) goto error;
10245 release1 = 1;
10246 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010247 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010248 if (n == 0)
10249 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010250 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010251 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010252 buf2 = _PyUnicode_AsKind(str2, rkind);
10253 if (!buf2) goto error;
10254 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010255 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010256 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010257 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010258 rkind = kind2;
10259 sbuf = _PyUnicode_AsKind(self, rkind);
10260 if (!sbuf) goto error;
10261 srelease = 1;
10262 if (release1) PyMem_Free(buf1);
10263 buf1 = _PyUnicode_AsKind(str1, rkind);
10264 if (!buf1) goto error;
10265 release1 = 1;
10266 }
10267 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10268 PyUnicode_GET_LENGTH(str1))); */
10269 product = n * (len2-len1);
10270 if ((product / (len2-len1)) != n) {
10271 PyErr_SetString(PyExc_OverflowError,
10272 "replace string is too long");
10273 goto error;
10274 }
10275 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +020010276 if (new_size == 0) {
10277 Py_INCREF(unicode_empty);
10278 u = unicode_empty;
10279 goto done;
10280 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010281 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10282 PyErr_SetString(PyExc_OverflowError,
10283 "replace string is too long");
10284 goto error;
10285 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010286 u = PyUnicode_New(new_size, maxchar);
10287 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010288 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010289 assert(PyUnicode_KIND(u) == rkind);
10290 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010291 ires = i = 0;
10292 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010293 while (n-- > 0) {
10294 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010295 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010296 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010297 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010298 if (j == -1)
10299 break;
10300 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010301 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010302 memcpy(res + rkind * ires,
10303 sbuf + rkind * i,
10304 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010305 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010306 }
10307 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010308 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010309 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010310 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010311 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010312 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010313 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010314 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010315 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010316 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010317 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010318 memcpy(res + rkind * ires,
10319 sbuf + rkind * i,
10320 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010321 }
10322 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010323 /* interleave */
10324 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010325 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010326 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010327 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010328 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010329 if (--n <= 0)
10330 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010331 memcpy(res + rkind * ires,
10332 sbuf + rkind * i,
10333 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010334 ires++;
10335 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010336 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010337 memcpy(res + rkind * ires,
10338 sbuf + rkind * i,
10339 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010340 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010341 }
10342
10343 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010344 unicode_adjust_maxchar(&u);
10345 if (u == NULL)
10346 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010347 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010348
10349 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010350 if (srelease)
10351 PyMem_FREE(sbuf);
10352 if (release1)
10353 PyMem_FREE(buf1);
10354 if (release2)
10355 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010356 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010357 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010358
Benjamin Peterson29060642009-01-31 22:14:21 +000010359 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010360 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010361 if (srelease)
10362 PyMem_FREE(sbuf);
10363 if (release1)
10364 PyMem_FREE(buf1);
10365 if (release2)
10366 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010367 if (PyUnicode_CheckExact(self)) {
10368 Py_INCREF(self);
10369 return (PyObject *) self;
10370 }
Victor Stinner034f6cf2011-09-30 02:26:44 +020010371 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010372 error:
10373 if (srelease && sbuf)
10374 PyMem_FREE(sbuf);
10375 if (release1 && buf1)
10376 PyMem_FREE(buf1);
10377 if (release2 && buf2)
10378 PyMem_FREE(buf2);
10379 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010380}
10381
10382/* --- Unicode Object Methods --------------------------------------------- */
10383
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010384PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010385 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010386\n\
10387Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010388characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010389
10390static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010391unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010392{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010393 return fixup(self, fixtitle);
10394}
10395
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010396PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010397 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010398\n\
10399Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010400have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010401
10402static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010403unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010404{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010405 return fixup(self, fixcapitalize);
10406}
10407
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled code: the surrounding "#if 0" keeps this out of the build.
 *
 * Splits `self` into a list of words, replaces every list item with the
 * result of fixup(item, fixcapitalize), and joins the list back together
 * with PyUnicode_Join(NULL, list).
 *
 * Returns the joined string, or NULL if splitting, capitalizing or joining
 * failed.  The temporary list is always released before returning.
 */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t pos = 0;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list slot */
    while (pos < PyList_GET_SIZE(words)) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, pos),
                            fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* drop the old item before PyList_SET_ITEM overwrites the slot */
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, capitalized);
        pos++;
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
10445
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010446/* Argument converter. Coerces to a single unicode character */
10447
10448static int
10449convert_uc(PyObject *obj, void *addr)
10450{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010451 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010452 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010453
Benjamin Peterson14339b62009-01-31 16:36:08 +000010454 uniobj = PyUnicode_FromObject(obj);
10455 if (uniobj == NULL) {
10456 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010457 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010458 return 0;
10459 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010460 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010461 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010462 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010463 Py_DECREF(uniobj);
10464 return 0;
10465 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010466 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010467 Py_DECREF(uniobj);
10468 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010469}
10470
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010471PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010472 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010473\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010474Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010475done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010476
10477static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010478unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010479{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010480 Py_ssize_t marg, left;
10481 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010482 Py_UCS4 fillchar = ' ';
10483
Victor Stinnere9a29352011-10-01 02:14:59 +020010484 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010485 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010486
Victor Stinnere9a29352011-10-01 02:14:59 +020010487 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010488 return NULL;
10489
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010490 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010491 Py_INCREF(self);
10492 return (PyObject*) self;
10493 }
10494
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010495 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010496 left = marg / 2 + (marg & width & 1);
10497
Victor Stinner9310abb2011-10-05 00:59:23 +020010498 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010499}
10500
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010501/* This function assumes that str1 and str2 are readied by the caller. */
10502
Marc-André Lemburge5034372000-08-08 08:04:29 +000010503static int
10504unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
10505{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010506 int kind1, kind2;
10507 void *data1, *data2;
10508 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010509
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010510 kind1 = PyUnicode_KIND(str1);
10511 kind2 = PyUnicode_KIND(str2);
10512 data1 = PyUnicode_DATA(str1);
10513 data2 = PyUnicode_DATA(str2);
10514 len1 = PyUnicode_GET_LENGTH(str1);
10515 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010516
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010517 for (i = 0; i < len1 && i < len2; ++i) {
10518 Py_UCS4 c1, c2;
10519 c1 = PyUnicode_READ(kind1, data1, i);
10520 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010521
10522 if (c1 != c2)
10523 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010524 }
10525
10526 return (len1 < len2) ? -1 : (len1 != len2);
10527}
10528
Alexander Belopolsky40018472011-02-26 01:02:56 +000010529int
10530PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010531{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010532 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10533 if (PyUnicode_READY(left) == -1 ||
10534 PyUnicode_READY(right) == -1)
10535 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010536 return unicode_compare((PyUnicodeObject *)left,
10537 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010538 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010539 PyErr_Format(PyExc_TypeError,
10540 "Can't compare %.100s and %.100s",
10541 left->ob_type->tp_name,
10542 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010543 return -1;
10544}
10545
Martin v. Löwis5b222132007-06-10 09:51:05 +000010546int
10547PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10548{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010549 Py_ssize_t i;
10550 int kind;
10551 void *data;
10552 Py_UCS4 chr;
10553
Victor Stinner910337b2011-10-03 03:20:16 +020010554 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010555 if (PyUnicode_READY(uni) == -1)
10556 return -1;
10557 kind = PyUnicode_KIND(uni);
10558 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010559 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010560 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10561 if (chr != str[i])
10562 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010563 /* This check keeps Python strings that end in '\0' from comparing equal
10564 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010565 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010566 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010567 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010568 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010569 return 0;
10570}
10571
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010572
Benjamin Peterson29060642009-01-31 22:14:21 +000010573#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010574 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010575
Alexander Belopolsky40018472011-02-26 01:02:56 +000010576PyObject *
10577PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010578{
10579 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010580
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010581 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10582 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010583 if (PyUnicode_READY(left) == -1 ||
10584 PyUnicode_READY(right) == -1)
10585 return NULL;
10586 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10587 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010588 if (op == Py_EQ) {
10589 Py_INCREF(Py_False);
10590 return Py_False;
10591 }
10592 if (op == Py_NE) {
10593 Py_INCREF(Py_True);
10594 return Py_True;
10595 }
10596 }
10597 if (left == right)
10598 result = 0;
10599 else
10600 result = unicode_compare((PyUnicodeObject *)left,
10601 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010602
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010603 /* Convert the return value to a Boolean */
10604 switch (op) {
10605 case Py_EQ:
10606 v = TEST_COND(result == 0);
10607 break;
10608 case Py_NE:
10609 v = TEST_COND(result != 0);
10610 break;
10611 case Py_LE:
10612 v = TEST_COND(result <= 0);
10613 break;
10614 case Py_GE:
10615 v = TEST_COND(result >= 0);
10616 break;
10617 case Py_LT:
10618 v = TEST_COND(result == -1);
10619 break;
10620 case Py_GT:
10621 v = TEST_COND(result == 1);
10622 break;
10623 default:
10624 PyErr_BadArgument();
10625 return NULL;
10626 }
10627 Py_INCREF(v);
10628 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010629 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010630
Brian Curtindfc80e32011-08-10 20:28:54 -050010631 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010632}
10633
Alexander Belopolsky40018472011-02-26 01:02:56 +000010634int
10635PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010636{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010637 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010638 int kind1, kind2, kind;
10639 void *buf1, *buf2;
10640 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010641 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010642
10643 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010644 sub = PyUnicode_FromObject(element);
10645 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010646 PyErr_Format(PyExc_TypeError,
10647 "'in <string>' requires string as left operand, not %s",
10648 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010649 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010650 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010651 if (PyUnicode_READY(sub) == -1)
10652 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010653
Thomas Wouters477c8d52006-05-27 19:21:47 +000010654 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +020010655 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010656 Py_DECREF(sub);
10657 return -1;
10658 }
10659
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010660 kind1 = PyUnicode_KIND(str);
10661 kind2 = PyUnicode_KIND(sub);
10662 kind = kind1 > kind2 ? kind1 : kind2;
10663 buf1 = PyUnicode_DATA(str);
10664 buf2 = PyUnicode_DATA(sub);
10665 if (kind1 != kind)
10666 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
10667 if (!buf1) {
10668 Py_DECREF(sub);
10669 return -1;
10670 }
10671 if (kind2 != kind)
10672 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
10673 if (!buf2) {
10674 Py_DECREF(sub);
10675 if (kind1 != kind) PyMem_Free(buf1);
10676 return -1;
10677 }
10678 len1 = PyUnicode_GET_LENGTH(str);
10679 len2 = PyUnicode_GET_LENGTH(sub);
10680
10681 switch(kind) {
10682 case PyUnicode_1BYTE_KIND:
10683 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10684 break;
10685 case PyUnicode_2BYTE_KIND:
10686 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10687 break;
10688 case PyUnicode_4BYTE_KIND:
10689 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10690 break;
10691 default:
10692 result = -1;
10693 assert(0);
10694 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010695
10696 Py_DECREF(str);
10697 Py_DECREF(sub);
10698
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010699 if (kind1 != kind)
10700 PyMem_Free(buf1);
10701 if (kind2 != kind)
10702 PyMem_Free(buf2);
10703
Guido van Rossum403d68b2000-03-13 15:55:09 +000010704 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010705}
10706
Guido van Rossumd57fd912000-03-10 22:53:23 +000010707/* Concat to string or Unicode object giving a new Unicode object. */
10708
Alexander Belopolsky40018472011-02-26 01:02:56 +000010709PyObject *
10710PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010711{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010712 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010713 Py_UCS4 maxchar, maxchar2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010714
10715 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010716 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010717 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010718 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010719 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010720 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010721 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010722
10723 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010724 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010725 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010726 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010727 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010728 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010729 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010730 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010731 }
10732
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010733 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010734 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
10735 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010736
Guido van Rossumd57fd912000-03-10 22:53:23 +000010737 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010738 w = PyUnicode_New(
10739 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
10740 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010741 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010742 goto onError;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010743 copy_characters(w, 0, u, 0, PyUnicode_GET_LENGTH(u));
10744 copy_characters(w, PyUnicode_GET_LENGTH(u), v, 0, PyUnicode_GET_LENGTH(v));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010745 Py_DECREF(u);
10746 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010747 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010748 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010749
Benjamin Peterson29060642009-01-31 22:14:21 +000010750 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010751 Py_XDECREF(u);
10752 Py_XDECREF(v);
10753 return NULL;
10754}
10755
Victor Stinnerb0923652011-10-04 01:17:31 +020010756static void
10757unicode_append_inplace(PyObject **p_left, PyObject *right)
10758{
10759 Py_ssize_t left_len, right_len, new_len;
Victor Stinnerb0923652011-10-04 01:17:31 +020010760
10761 assert(PyUnicode_IS_READY(*p_left));
10762 assert(PyUnicode_IS_READY(right));
10763
10764 left_len = PyUnicode_GET_LENGTH(*p_left);
10765 right_len = PyUnicode_GET_LENGTH(right);
10766 if (left_len > PY_SSIZE_T_MAX - right_len) {
10767 PyErr_SetString(PyExc_OverflowError,
10768 "strings are too large to concat");
10769 goto error;
10770 }
10771 new_len = left_len + right_len;
10772
10773 /* Now we own the last reference to 'left', so we can resize it
10774 * in-place.
10775 */
10776 if (unicode_resize(p_left, new_len) != 0) {
10777 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10778 * deallocated so it cannot be put back into
10779 * 'variable'. The MemoryError is raised when there
10780 * is no value in 'variable', which might (very
10781 * remotely) be a cause of incompatibilities.
10782 */
10783 goto error;
10784 }
10785 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010786 copy_characters(*p_left, left_len, right, 0, right_len);
10787 _PyUnicode_DIRTY(*p_left);
Victor Stinnerb0923652011-10-04 01:17:31 +020010788 return;
10789
10790error:
10791 Py_DECREF(*p_left);
10792 *p_left = NULL;
10793}
10794
Walter Dörwald1ab83302007-05-18 17:15:44 +000010795void
Victor Stinner23e56682011-10-03 03:54:37 +020010796PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010797{
Victor Stinner23e56682011-10-03 03:54:37 +020010798 PyObject *left, *res;
10799
10800 if (p_left == NULL) {
10801 if (!PyErr_Occurred())
10802 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010803 return;
10804 }
Victor Stinner23e56682011-10-03 03:54:37 +020010805 left = *p_left;
10806 if (right == NULL || !PyUnicode_Check(left)) {
10807 if (!PyErr_Occurred())
10808 PyErr_BadInternalCall();
10809 goto error;
10810 }
10811
Victor Stinnere1335c72011-10-04 20:53:03 +020010812 if (PyUnicode_READY(left))
10813 goto error;
10814 if (PyUnicode_READY(right))
10815 goto error;
10816
Victor Stinner23e56682011-10-03 03:54:37 +020010817 if (PyUnicode_CheckExact(left) && left != unicode_empty
10818 && PyUnicode_CheckExact(right) && right != unicode_empty
10819 && unicode_resizable(left)
10820 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
10821 || _PyUnicode_WSTR(left) != NULL))
10822 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010823 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10824 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010825 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010826 not so different than duplicating the string. */
10827 if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
Victor Stinner23e56682011-10-03 03:54:37 +020010828 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010829 unicode_append_inplace(p_left, right);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010830 if (p_left != NULL)
10831 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010832 return;
10833 }
10834 }
10835
10836 res = PyUnicode_Concat(left, right);
10837 if (res == NULL)
10838 goto error;
10839 Py_DECREF(left);
10840 *p_left = res;
10841 return;
10842
10843error:
10844 Py_DECREF(*p_left);
10845 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +000010846}
10847
10848void
10849PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10850{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010851 PyUnicode_Append(pleft, right);
10852 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010853}
10854
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010855PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010856 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010857\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010858Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010859string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010860interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010861
10862static PyObject *
10863unicode_count(PyUnicodeObject *self, PyObject *args)
10864{
10865 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010866 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010867 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010868 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010869 int kind1, kind2, kind;
10870 void *buf1, *buf2;
10871 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010872
Jesus Ceaac451502011-04-20 17:09:23 +020010873 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10874 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010875 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010876
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010877 kind1 = PyUnicode_KIND(self);
10878 kind2 = PyUnicode_KIND(substring);
10879 kind = kind1 > kind2 ? kind1 : kind2;
10880 buf1 = PyUnicode_DATA(self);
10881 buf2 = PyUnicode_DATA(substring);
10882 if (kind1 != kind)
10883 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
10884 if (!buf1) {
10885 Py_DECREF(substring);
10886 return NULL;
10887 }
10888 if (kind2 != kind)
10889 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
10890 if (!buf2) {
10891 Py_DECREF(substring);
10892 if (kind1 != kind) PyMem_Free(buf1);
10893 return NULL;
10894 }
10895 len1 = PyUnicode_GET_LENGTH(self);
10896 len2 = PyUnicode_GET_LENGTH(substring);
10897
10898 ADJUST_INDICES(start, end, len1);
10899 switch(kind) {
10900 case PyUnicode_1BYTE_KIND:
10901 iresult = ucs1lib_count(
10902 ((Py_UCS1*)buf1) + start, end - start,
10903 buf2, len2, PY_SSIZE_T_MAX
10904 );
10905 break;
10906 case PyUnicode_2BYTE_KIND:
10907 iresult = ucs2lib_count(
10908 ((Py_UCS2*)buf1) + start, end - start,
10909 buf2, len2, PY_SSIZE_T_MAX
10910 );
10911 break;
10912 case PyUnicode_4BYTE_KIND:
10913 iresult = ucs4lib_count(
10914 ((Py_UCS4*)buf1) + start, end - start,
10915 buf2, len2, PY_SSIZE_T_MAX
10916 );
10917 break;
10918 default:
10919 assert(0); iresult = 0;
10920 }
10921
10922 result = PyLong_FromSsize_t(iresult);
10923
10924 if (kind1 != kind)
10925 PyMem_Free(buf1);
10926 if (kind2 != kind)
10927 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010928
10929 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010930
Guido van Rossumd57fd912000-03-10 22:53:23 +000010931 return result;
10932}
10933
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010934PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010935 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010936\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010937Encode S using the codec registered for encoding. Default encoding\n\
10938is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010939handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010940a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10941'xmlcharrefreplace' as well as any other name registered with\n\
10942codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010943
10944static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +000010945unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010946{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010947 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010948 char *encoding = NULL;
10949 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010950
Benjamin Peterson308d6372009-09-18 21:42:35 +000010951 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10952 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010953 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +000010954 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010955}
10956
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010957PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010958 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010959\n\
10960Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010961If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010962
10963static PyObject*
10964unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
10965{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010966 Py_ssize_t i, j, line_pos, src_len, incr;
10967 Py_UCS4 ch;
10968 PyObject *u;
10969 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010970 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010971 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010972 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010973
10974 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010975 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010976
Antoine Pitrou22425222011-10-04 19:10:51 +020010977 if (PyUnicode_READY(self) == -1)
10978 return NULL;
10979
Thomas Wouters7e474022000-07-16 12:04:32 +000010980 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010981 src_len = PyUnicode_GET_LENGTH(self);
10982 i = j = line_pos = 0;
10983 kind = PyUnicode_KIND(self);
10984 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010985 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010986 for (; i < src_len; i++) {
10987 ch = PyUnicode_READ(kind, src_data, i);
10988 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010989 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010990 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010991 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010992 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010993 goto overflow;
10994 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010995 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010996 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010997 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010998 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010999 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011000 goto overflow;
11001 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011002 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011003 if (ch == '\n' || ch == '\r')
11004 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011005 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011006 }
Antoine Pitroue19aa382011-10-04 16:04:01 +020011007 if (!found && PyUnicode_CheckExact(self)) {
11008 Py_INCREF((PyObject *) self);
11009 return (PyObject *) self;
11010 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011011
Guido van Rossumd57fd912000-03-10 22:53:23 +000011012 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011013 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011014 if (!u)
11015 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011016 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011017
Antoine Pitroue71d5742011-10-04 15:55:09 +020011018 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011019
Antoine Pitroue71d5742011-10-04 15:55:09 +020011020 for (; i < src_len; i++) {
11021 ch = PyUnicode_READ(kind, src_data, i);
11022 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011023 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011024 incr = tabsize - (line_pos % tabsize);
11025 line_pos += incr;
11026 while (incr--) {
11027 PyUnicode_WRITE(kind, dest_data, j, ' ');
11028 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011029 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011030 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011031 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011032 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011033 line_pos++;
11034 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011035 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011036 if (ch == '\n' || ch == '\r')
11037 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011038 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011039 }
11040 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinner17efeed2011-10-04 20:05:46 +020011041#ifndef DONT_MAKE_RESULT_READY
11042 if (_PyUnicode_READY_REPLACE(&u)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011043 Py_DECREF(u);
11044 return NULL;
11045 }
Victor Stinner17efeed2011-10-04 20:05:46 +020011046#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011047 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011048 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011049
Antoine Pitroue71d5742011-10-04 15:55:09 +020011050 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011051 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11052 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011053}
11054
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011055PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011056 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011057\n\
11058Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011059such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011060arguments start and end are interpreted as in slice notation.\n\
11061\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011062Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011063
11064static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011065unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011066{
Jesus Ceaac451502011-04-20 17:09:23 +020011067 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011068 Py_ssize_t start;
11069 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011070 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011071
Jesus Ceaac451502011-04-20 17:09:23 +020011072 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11073 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011074 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011075
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011076 if (PyUnicode_READY(self) == -1)
11077 return NULL;
11078 if (PyUnicode_READY(substring) == -1)
11079 return NULL;
11080
Victor Stinner794d5672011-10-10 03:21:36 +020011081 result = any_find_slice(1,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011082 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011083 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011084
11085 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011086
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011087 if (result == -2)
11088 return NULL;
11089
Christian Heimes217cfd12007-12-02 14:31:20 +000011090 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011091}
11092
11093static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011094unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011095{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011096 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
11097 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011098 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011099 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011100}
11101
Guido van Rossumc2504932007-09-18 19:42:40 +000011102/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011103 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011104static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +000011105unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011106{
Guido van Rossumc2504932007-09-18 19:42:40 +000011107 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010011108 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011109
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011110 if (_PyUnicode_HASH(self) != -1)
11111 return _PyUnicode_HASH(self);
11112 if (PyUnicode_READY(self) == -1)
11113 return -1;
11114 len = PyUnicode_GET_LENGTH(self);
11115
11116 /* The hash function as a macro, gets expanded three times below. */
11117#define HASH(P) \
11118 x = (Py_uhash_t)*P << 7; \
11119 while (--len >= 0) \
11120 x = (1000003*x) ^ (Py_uhash_t)*P++;
11121
11122 switch (PyUnicode_KIND(self)) {
11123 case PyUnicode_1BYTE_KIND: {
11124 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11125 HASH(c);
11126 break;
11127 }
11128 case PyUnicode_2BYTE_KIND: {
11129 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11130 HASH(s);
11131 break;
11132 }
11133 default: {
11134 Py_UCS4 *l;
11135 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11136 "Impossible switch case in unicode_hash");
11137 l = PyUnicode_4BYTE_DATA(self);
11138 HASH(l);
11139 break;
11140 }
11141 }
11142 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
11143
Guido van Rossumc2504932007-09-18 19:42:40 +000011144 if (x == -1)
11145 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011146 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011147 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011148}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011149#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011150
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011151PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011152 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011153\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011154Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011155
11156static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011157unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011158{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011159 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +020011160 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011161 Py_ssize_t start;
11162 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011163
Jesus Ceaac451502011-04-20 17:09:23 +020011164 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11165 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011166 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011167
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011168 if (PyUnicode_READY(self) == -1)
11169 return NULL;
11170 if (PyUnicode_READY(substring) == -1)
11171 return NULL;
11172
Victor Stinner794d5672011-10-10 03:21:36 +020011173 result = any_find_slice(1,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011174 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011175 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011176
11177 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011178
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011179 if (result == -2)
11180 return NULL;
11181
Guido van Rossumd57fd912000-03-10 22:53:23 +000011182 if (result < 0) {
11183 PyErr_SetString(PyExc_ValueError, "substring not found");
11184 return NULL;
11185 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011186
Christian Heimes217cfd12007-12-02 14:31:20 +000011187 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011188}
11189
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011190PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011191 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011192\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011193Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011194at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011195
11196static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011197unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011198{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011199 Py_ssize_t i, length;
11200 int kind;
11201 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011202 int cased;
11203
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011204 if (PyUnicode_READY(self) == -1)
11205 return NULL;
11206 length = PyUnicode_GET_LENGTH(self);
11207 kind = PyUnicode_KIND(self);
11208 data = PyUnicode_DATA(self);
11209
Guido van Rossumd57fd912000-03-10 22:53:23 +000011210 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011211 if (length == 1)
11212 return PyBool_FromLong(
11213 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011214
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011215 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011216 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011217 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011218
Guido van Rossumd57fd912000-03-10 22:53:23 +000011219 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011220 for (i = 0; i < length; i++) {
11221 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011222
Benjamin Peterson29060642009-01-31 22:14:21 +000011223 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11224 return PyBool_FromLong(0);
11225 else if (!cased && Py_UNICODE_ISLOWER(ch))
11226 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011227 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011228 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011229}
11230
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011231PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011232 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011233\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011234Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011235at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011236
11237static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011238unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011239{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011240 Py_ssize_t i, length;
11241 int kind;
11242 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011243 int cased;
11244
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011245 if (PyUnicode_READY(self) == -1)
11246 return NULL;
11247 length = PyUnicode_GET_LENGTH(self);
11248 kind = PyUnicode_KIND(self);
11249 data = PyUnicode_DATA(self);
11250
Guido van Rossumd57fd912000-03-10 22:53:23 +000011251 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011252 if (length == 1)
11253 return PyBool_FromLong(
11254 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011255
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011256 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011257 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011258 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011259
Guido van Rossumd57fd912000-03-10 22:53:23 +000011260 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011261 for (i = 0; i < length; i++) {
11262 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011263
Benjamin Peterson29060642009-01-31 22:14:21 +000011264 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11265 return PyBool_FromLong(0);
11266 else if (!cased && Py_UNICODE_ISUPPER(ch))
11267 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011268 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011269 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011270}
11271
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011272PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011273 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011274\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011275Return True if S is a titlecased string and there is at least one\n\
11276character in S, i.e. upper- and titlecase characters may only\n\
11277follow uncased characters and lowercase characters only cased ones.\n\
11278Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011279
11280static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011281unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011282{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011283 Py_ssize_t i, length;
11284 int kind;
11285 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011286 int cased, previous_is_cased;
11287
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011288 if (PyUnicode_READY(self) == -1)
11289 return NULL;
11290 length = PyUnicode_GET_LENGTH(self);
11291 kind = PyUnicode_KIND(self);
11292 data = PyUnicode_DATA(self);
11293
Guido van Rossumd57fd912000-03-10 22:53:23 +000011294 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011295 if (length == 1) {
11296 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11297 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11298 (Py_UNICODE_ISUPPER(ch) != 0));
11299 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011300
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011301 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011302 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011303 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011304
Guido van Rossumd57fd912000-03-10 22:53:23 +000011305 cased = 0;
11306 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011307 for (i = 0; i < length; i++) {
11308 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011309
Benjamin Peterson29060642009-01-31 22:14:21 +000011310 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11311 if (previous_is_cased)
11312 return PyBool_FromLong(0);
11313 previous_is_cased = 1;
11314 cased = 1;
11315 }
11316 else if (Py_UNICODE_ISLOWER(ch)) {
11317 if (!previous_is_cased)
11318 return PyBool_FromLong(0);
11319 previous_is_cased = 1;
11320 cased = 1;
11321 }
11322 else
11323 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011324 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011325 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011326}
11327
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011328PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011329 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011330\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011331Return True if all characters in S are whitespace\n\
11332and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011333
11334static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011335unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011336{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011337 Py_ssize_t i, length;
11338 int kind;
11339 void *data;
11340
11341 if (PyUnicode_READY(self) == -1)
11342 return NULL;
11343 length = PyUnicode_GET_LENGTH(self);
11344 kind = PyUnicode_KIND(self);
11345 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011346
Guido van Rossumd57fd912000-03-10 22:53:23 +000011347 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011348 if (length == 1)
11349 return PyBool_FromLong(
11350 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011351
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011352 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011353 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011354 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011355
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011356 for (i = 0; i < length; i++) {
11357 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011358 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011359 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011360 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011361 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011362}
11363
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011364PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011365 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011366\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011367Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011368and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011369
11370static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011371unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011372{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011373 Py_ssize_t i, length;
11374 int kind;
11375 void *data;
11376
11377 if (PyUnicode_READY(self) == -1)
11378 return NULL;
11379 length = PyUnicode_GET_LENGTH(self);
11380 kind = PyUnicode_KIND(self);
11381 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011382
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011383 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011384 if (length == 1)
11385 return PyBool_FromLong(
11386 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011387
11388 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011389 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011390 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011391
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011392 for (i = 0; i < length; i++) {
11393 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011394 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011395 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011396 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011397}
11398
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011399PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011400 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011401\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011402Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011403and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011404
11405static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011406unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011407{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011408 int kind;
11409 void *data;
11410 Py_ssize_t len, i;
11411
11412 if (PyUnicode_READY(self) == -1)
11413 return NULL;
11414
11415 kind = PyUnicode_KIND(self);
11416 data = PyUnicode_DATA(self);
11417 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011418
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011419 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011420 if (len == 1) {
11421 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11422 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11423 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011424
11425 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011426 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011427 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011428
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011429 for (i = 0; i < len; i++) {
11430 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011431 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011432 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011433 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011434 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011435}
11436
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011437PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011438 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011439\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011440Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011441False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011442
11443static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011444unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011445{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011446 Py_ssize_t i, length;
11447 int kind;
11448 void *data;
11449
11450 if (PyUnicode_READY(self) == -1)
11451 return NULL;
11452 length = PyUnicode_GET_LENGTH(self);
11453 kind = PyUnicode_KIND(self);
11454 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011455
Guido van Rossumd57fd912000-03-10 22:53:23 +000011456 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011457 if (length == 1)
11458 return PyBool_FromLong(
11459 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011460
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011461 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011462 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011463 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011464
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011465 for (i = 0; i < length; i++) {
11466 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011467 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011468 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011469 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011470}
11471
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011472PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011473 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011474\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011475Return True if all characters in S are digits\n\
11476and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011477
11478static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011479unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011480{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011481 Py_ssize_t i, length;
11482 int kind;
11483 void *data;
11484
11485 if (PyUnicode_READY(self) == -1)
11486 return NULL;
11487 length = PyUnicode_GET_LENGTH(self);
11488 kind = PyUnicode_KIND(self);
11489 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011490
Guido van Rossumd57fd912000-03-10 22:53:23 +000011491 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011492 if (length == 1) {
11493 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11494 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11495 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011496
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011497 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011498 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011499 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011500
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011501 for (i = 0; i < length; i++) {
11502 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011503 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011504 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011505 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011506}
11507
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011508PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011509 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011510\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011511Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011512False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011513
11514static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011515unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011516{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011517 Py_ssize_t i, length;
11518 int kind;
11519 void *data;
11520
11521 if (PyUnicode_READY(self) == -1)
11522 return NULL;
11523 length = PyUnicode_GET_LENGTH(self);
11524 kind = PyUnicode_KIND(self);
11525 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011526
Guido van Rossumd57fd912000-03-10 22:53:23 +000011527 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011528 if (length == 1)
11529 return PyBool_FromLong(
11530 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011531
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011532 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011533 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011534 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011535
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011536 for (i = 0; i < length; i++) {
11537 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011538 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011539 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011540 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011541}
11542
Martin v. Löwis47383402007-08-15 07:32:56 +000011543int
11544PyUnicode_IsIdentifier(PyObject *self)
11545{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011546 int kind;
11547 void *data;
11548 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011549 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011550
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011551 if (PyUnicode_READY(self) == -1) {
11552 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011553 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011554 }
11555
11556 /* Special case for empty strings */
11557 if (PyUnicode_GET_LENGTH(self) == 0)
11558 return 0;
11559 kind = PyUnicode_KIND(self);
11560 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011561
11562 /* PEP 3131 says that the first character must be in
11563 XID_Start and subsequent characters in XID_Continue,
11564 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011565 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011566 letters, digits, underscore). However, given the current
11567 definition of XID_Start and XID_Continue, it is sufficient
11568 to check just for these, except that _ must be allowed
11569 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011570 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011571 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011572 return 0;
11573
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011574 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011575 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011576 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011577 return 1;
11578}
11579
11580PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011581 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011582\n\
11583Return True if S is a valid identifier according\n\
11584to the language definition.");
11585
11586static PyObject*
11587unicode_isidentifier(PyObject *self)
11588{
11589 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11590}
11591
Georg Brandl559e5d72008-06-11 18:37:52 +000011592PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011593 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011594\n\
11595Return True if all characters in S are considered\n\
11596printable in repr() or S is empty, False otherwise.");
11597
11598static PyObject*
11599unicode_isprintable(PyObject *self)
11600{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011601 Py_ssize_t i, length;
11602 int kind;
11603 void *data;
11604
11605 if (PyUnicode_READY(self) == -1)
11606 return NULL;
11607 length = PyUnicode_GET_LENGTH(self);
11608 kind = PyUnicode_KIND(self);
11609 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011610
11611 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011612 if (length == 1)
11613 return PyBool_FromLong(
11614 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011615
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011616 for (i = 0; i < length; i++) {
11617 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011618 Py_RETURN_FALSE;
11619 }
11620 }
11621 Py_RETURN_TRUE;
11622}
11623
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011624PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011625 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011626\n\
11627Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011628iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011629
11630static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011631unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011632{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011633 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011634}
11635
Martin v. Löwis18e16552006-02-15 17:27:45 +000011636static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000011637unicode_length(PyUnicodeObject *self)
11638{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011639 if (PyUnicode_READY(self) == -1)
11640 return -1;
11641 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011642}
11643
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011644PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011645 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011646\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011647Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011648done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011649
11650static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011651unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011652{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011653 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011654 Py_UCS4 fillchar = ' ';
11655
11656 if (PyUnicode_READY(self) == -1)
11657 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011658
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011659 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011660 return NULL;
11661
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011662 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011663 Py_INCREF(self);
11664 return (PyObject*) self;
11665 }
11666
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011667 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011668}
11669
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011670PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011671 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011672\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011673Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011674
11675static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011676unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011677{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011678 return fixup(self, fixlower);
11679}
11680
/* Strip modes: which end(s) of the string get trimmed. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by the strip modes above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Bare method name for a strip mode: the format string with its
   three-character "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
11689
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011690/* externally visible for str.strip(unicode) */
11691PyObject *
11692_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
11693{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011694 void *data;
11695 int kind;
11696 Py_ssize_t i, j, len;
11697 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011698
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011699 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11700 return NULL;
11701
11702 kind = PyUnicode_KIND(self);
11703 data = PyUnicode_DATA(self);
11704 len = PyUnicode_GET_LENGTH(self);
11705 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11706 PyUnicode_DATA(sepobj),
11707 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011708
Benjamin Peterson14339b62009-01-31 16:36:08 +000011709 i = 0;
11710 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011711 while (i < len &&
11712 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011713 i++;
11714 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011715 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011716
Benjamin Peterson14339b62009-01-31 16:36:08 +000011717 j = len;
11718 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011719 do {
11720 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011721 } while (j >= i &&
11722 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011723 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011724 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011725
Victor Stinner12bab6d2011-10-01 01:53:49 +020011726 return PyUnicode_Substring((PyObject*)self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011727}
11728
11729PyObject*
11730PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11731{
11732 unsigned char *data;
11733 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011734 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011735
Victor Stinnerde636f32011-10-01 03:55:54 +020011736 if (PyUnicode_READY(self) == -1)
11737 return NULL;
11738
11739 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11740
Victor Stinner12bab6d2011-10-01 01:53:49 +020011741 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011742 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011743 if (PyUnicode_CheckExact(self)) {
11744 Py_INCREF(self);
11745 return self;
11746 }
11747 else
11748 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011749 }
11750
Victor Stinner12bab6d2011-10-01 01:53:49 +020011751 length = end - start;
11752 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011753 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011754
Victor Stinnerde636f32011-10-01 03:55:54 +020011755 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011756 PyErr_SetString(PyExc_IndexError, "string index out of range");
11757 return NULL;
11758 }
11759
Victor Stinnerb9275c12011-10-05 14:01:42 +020011760 if (PyUnicode_IS_ASCII(self)) {
11761 kind = PyUnicode_KIND(self);
11762 data = PyUnicode_1BYTE_DATA(self);
11763 return unicode_fromascii(data + start, length);
11764 }
11765 else {
11766 kind = PyUnicode_KIND(self);
11767 data = PyUnicode_1BYTE_DATA(self);
11768 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011769 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011770 length);
11771 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011772}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011773
11774static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011775do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011776{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011777 int kind;
11778 void *data;
11779 Py_ssize_t len, i, j;
11780
11781 if (PyUnicode_READY(self) == -1)
11782 return NULL;
11783
11784 kind = PyUnicode_KIND(self);
11785 data = PyUnicode_DATA(self);
11786 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011787
Benjamin Peterson14339b62009-01-31 16:36:08 +000011788 i = 0;
11789 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011790 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011791 i++;
11792 }
11793 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011794
Benjamin Peterson14339b62009-01-31 16:36:08 +000011795 j = len;
11796 if (striptype != LEFTSTRIP) {
11797 do {
11798 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011799 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011800 j++;
11801 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011802
Victor Stinner12bab6d2011-10-01 01:53:49 +020011803 return PyUnicode_Substring((PyObject*)self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011804}
11805
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011806
11807static PyObject *
11808do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
11809{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011810 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011811
Benjamin Peterson14339b62009-01-31 16:36:08 +000011812 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11813 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011814
Benjamin Peterson14339b62009-01-31 16:36:08 +000011815 if (sep != NULL && sep != Py_None) {
11816 if (PyUnicode_Check(sep))
11817 return _PyUnicode_XStrip(self, striptype, sep);
11818 else {
11819 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011820 "%s arg must be None or str",
11821 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011822 return NULL;
11823 }
11824 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011825
Benjamin Peterson14339b62009-01-31 16:36:08 +000011826 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011827}
11828
11829
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011830PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011831 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011832\n\
11833Return a copy of the string S with leading and trailing\n\
11834whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011835If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011836
11837static PyObject *
11838unicode_strip(PyUnicodeObject *self, PyObject *args)
11839{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011840 if (PyTuple_GET_SIZE(args) == 0)
11841 return do_strip(self, BOTHSTRIP); /* Common case */
11842 else
11843 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011844}
11845
11846
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011847PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011848 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011849\n\
11850Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011851If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011852
11853static PyObject *
11854unicode_lstrip(PyUnicodeObject *self, PyObject *args)
11855{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011856 if (PyTuple_GET_SIZE(args) == 0)
11857 return do_strip(self, LEFTSTRIP); /* Common case */
11858 else
11859 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011860}
11861
11862
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011863PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011864 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011865\n\
11866Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011867If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011868
11869static PyObject *
11870unicode_rstrip(PyUnicodeObject *self, PyObject *args)
11871{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011872 if (PyTuple_GET_SIZE(args) == 0)
11873 return do_strip(self, RIGHTSTRIP); /* Common case */
11874 else
11875 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011876}
11877
11878
Guido van Rossumd57fd912000-03-10 22:53:23 +000011879static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000011880unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011881{
11882 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011883 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011884
Georg Brandl222de0f2009-04-12 12:01:50 +000011885 if (len < 1) {
11886 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011887 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011888 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011889
Tim Peters7a29bd52001-09-12 03:03:31 +000011890 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011891 /* no repeat, return original string */
11892 Py_INCREF(str);
11893 return (PyObject*) str;
11894 }
Tim Peters8f422462000-09-09 06:13:41 +000011895
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011896 if (PyUnicode_READY(str) == -1)
11897 return NULL;
11898
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011899 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011900 PyErr_SetString(PyExc_OverflowError,
11901 "repeated string is too long");
11902 return NULL;
11903 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011904 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011905
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011906 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011907 if (!u)
11908 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011909 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011910
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011911 if (PyUnicode_GET_LENGTH(str) == 1) {
11912 const int kind = PyUnicode_KIND(str);
11913 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11914 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011915 if (kind == PyUnicode_1BYTE_KIND)
11916 memset(to, (unsigned char)fill_char, len);
11917 else {
11918 for (n = 0; n < len; ++n)
11919 PyUnicode_WRITE(kind, to, n, fill_char);
11920 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011921 }
11922 else {
11923 /* number of characters copied this far */
11924 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011925 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011926 char *to = (char *) PyUnicode_DATA(u);
11927 Py_MEMCPY(to, PyUnicode_DATA(str),
11928 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011929 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011930 n = (done <= nchars-done) ? done : nchars-done;
11931 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011932 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011933 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011934 }
11935
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011936 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011937 return (PyObject*) u;
11938}
11939
Alexander Belopolsky40018472011-02-26 01:02:56 +000011940PyObject *
11941PyUnicode_Replace(PyObject *obj,
11942 PyObject *subobj,
11943 PyObject *replobj,
11944 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011945{
11946 PyObject *self;
11947 PyObject *str1;
11948 PyObject *str2;
11949 PyObject *result;
11950
11951 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011952 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011953 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011954 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011955 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011956 Py_DECREF(self);
11957 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011958 }
11959 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011960 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011961 Py_DECREF(self);
11962 Py_DECREF(str1);
11963 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011964 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011965 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011966 Py_DECREF(self);
11967 Py_DECREF(str1);
11968 Py_DECREF(str2);
11969 return result;
11970}
11971
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011972PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011973 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011974\n\
11975Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011976old replaced by new. If the optional argument count is\n\
11977given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011978
11979static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011980unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011981{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011982 PyObject *str1;
11983 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011984 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011985 PyObject *result;
11986
Martin v. Löwis18e16552006-02-15 17:27:45 +000011987 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011988 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011989 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011990 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011991 str1 = PyUnicode_FromObject(str1);
11992 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11993 return NULL;
11994 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011995 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011996 Py_DECREF(str1);
11997 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011998 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011999
12000 result = replace(self, str1, str2, maxcount);
12001
12002 Py_DECREF(str1);
12003 Py_DECREF(str2);
12004 return result;
12005}
12006
Alexander Belopolsky40018472011-02-26 01:02:56 +000012007static PyObject *
12008unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012009{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012010 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012011 Py_ssize_t isize;
12012 Py_ssize_t osize, squote, dquote, i, o;
12013 Py_UCS4 max, quote;
12014 int ikind, okind;
12015 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012016
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012017 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012018 return NULL;
12019
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012020 isize = PyUnicode_GET_LENGTH(unicode);
12021 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012022
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012023 /* Compute length of output, quote characters, and
12024 maximum character */
12025 osize = 2; /* quotes */
12026 max = 127;
12027 squote = dquote = 0;
12028 ikind = PyUnicode_KIND(unicode);
12029 for (i = 0; i < isize; i++) {
12030 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12031 switch (ch) {
12032 case '\'': squote++; osize++; break;
12033 case '"': dquote++; osize++; break;
12034 case '\\': case '\t': case '\r': case '\n':
12035 osize += 2; break;
12036 default:
12037 /* Fast-path ASCII */
12038 if (ch < ' ' || ch == 0x7f)
12039 osize += 4; /* \xHH */
12040 else if (ch < 0x7f)
12041 osize++;
12042 else if (Py_UNICODE_ISPRINTABLE(ch)) {
12043 osize++;
12044 max = ch > max ? ch : max;
12045 }
12046 else if (ch < 0x100)
12047 osize += 4; /* \xHH */
12048 else if (ch < 0x10000)
12049 osize += 6; /* \uHHHH */
12050 else
12051 osize += 10; /* \uHHHHHHHH */
12052 }
12053 }
12054
12055 quote = '\'';
12056 if (squote) {
12057 if (dquote)
12058 /* Both squote and dquote present. Use squote,
12059 and escape them */
12060 osize += squote;
12061 else
12062 quote = '"';
12063 }
12064
12065 repr = PyUnicode_New(osize, max);
12066 if (repr == NULL)
12067 return NULL;
12068 okind = PyUnicode_KIND(repr);
12069 odata = PyUnicode_DATA(repr);
12070
12071 PyUnicode_WRITE(okind, odata, 0, quote);
12072 PyUnicode_WRITE(okind, odata, osize-1, quote);
12073
12074 for (i = 0, o = 1; i < isize; i++) {
12075 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012076
12077 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012078 if ((ch == quote) || (ch == '\\')) {
12079 PyUnicode_WRITE(okind, odata, o++, '\\');
12080 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012081 continue;
12082 }
12083
Benjamin Peterson29060642009-01-31 22:14:21 +000012084 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012085 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012086 PyUnicode_WRITE(okind, odata, o++, '\\');
12087 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012088 }
12089 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012090 PyUnicode_WRITE(okind, odata, o++, '\\');
12091 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012092 }
12093 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012094 PyUnicode_WRITE(okind, odata, o++, '\\');
12095 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012096 }
12097
12098 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012099 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012100 PyUnicode_WRITE(okind, odata, o++, '\\');
12101 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012102 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12103 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012104 }
12105
Georg Brandl559e5d72008-06-11 18:37:52 +000012106 /* Copy ASCII characters as-is */
12107 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012108 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012109 }
12110
Benjamin Peterson29060642009-01-31 22:14:21 +000012111 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000012112 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012113 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000012114 (categories Z* and C* except ASCII space)
12115 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012116 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012117 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012118 if (ch <= 0xff) {
12119 PyUnicode_WRITE(okind, odata, o++, '\\');
12120 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012121 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12122 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012123 }
12124 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012125 else if (ch >= 0x10000) {
12126 PyUnicode_WRITE(okind, odata, o++, '\\');
12127 PyUnicode_WRITE(okind, odata, o++, 'U');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012128 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12129 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12130 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12131 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12132 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12133 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12134 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12135 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012136 }
12137 /* Map 16-bit characters to '\uxxxx' */
12138 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012139 PyUnicode_WRITE(okind, odata, o++, '\\');
12140 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012141 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12142 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12143 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12144 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012145 }
12146 }
12147 /* Copy characters as-is */
12148 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012149 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012150 }
12151 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012152 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012153 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012154 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012155 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012156}
12157
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012158PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012159 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012160\n\
12161Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012162such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012163arguments start and end are interpreted as in slice notation.\n\
12164\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012165Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012166
12167static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012168unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012169{
Jesus Ceaac451502011-04-20 17:09:23 +020012170 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012171 Py_ssize_t start;
12172 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012173 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012174
Jesus Ceaac451502011-04-20 17:09:23 +020012175 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12176 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012177 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012178
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012179 if (PyUnicode_READY(self) == -1)
12180 return NULL;
12181 if (PyUnicode_READY(substring) == -1)
12182 return NULL;
12183
Victor Stinner794d5672011-10-10 03:21:36 +020012184 result = any_find_slice(-1,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012185 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000012186 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000012187
12188 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012189
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012190 if (result == -2)
12191 return NULL;
12192
Christian Heimes217cfd12007-12-02 14:31:20 +000012193 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012194}
12195
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012196PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012197 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012198\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012199Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012200
12201static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012202unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012203{
Jesus Ceaac451502011-04-20 17:09:23 +020012204 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012205 Py_ssize_t start;
12206 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012207 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012208
Jesus Ceaac451502011-04-20 17:09:23 +020012209 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12210 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012211 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012212
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012213 if (PyUnicode_READY(self) == -1)
12214 return NULL;
12215 if (PyUnicode_READY(substring) == -1)
12216 return NULL;
12217
Victor Stinner794d5672011-10-10 03:21:36 +020012218 result = any_find_slice(-1,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012219 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000012220 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000012221
12222 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012223
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012224 if (result == -2)
12225 return NULL;
12226
Guido van Rossumd57fd912000-03-10 22:53:23 +000012227 if (result < 0) {
12228 PyErr_SetString(PyExc_ValueError, "substring not found");
12229 return NULL;
12230 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012231
Christian Heimes217cfd12007-12-02 14:31:20 +000012232 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012233}
12234
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012235PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012236 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012237\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012238Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012239done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012240
12241static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012242unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012243{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012244 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012245 Py_UCS4 fillchar = ' ';
12246
Victor Stinnere9a29352011-10-01 02:14:59 +020012247 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012248 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012249
Victor Stinnere9a29352011-10-01 02:14:59 +020012250 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012251 return NULL;
12252
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012253 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012254 Py_INCREF(self);
12255 return (PyObject*) self;
12256 }
12257
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012258 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012259}
12260
Alexander Belopolsky40018472011-02-26 01:02:56 +000012261PyObject *
12262PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012263{
12264 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012265
Guido van Rossumd57fd912000-03-10 22:53:23 +000012266 s = PyUnicode_FromObject(s);
12267 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012268 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012269 if (sep != NULL) {
12270 sep = PyUnicode_FromObject(sep);
12271 if (sep == NULL) {
12272 Py_DECREF(s);
12273 return NULL;
12274 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012275 }
12276
Victor Stinner9310abb2011-10-05 00:59:23 +020012277 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012278
12279 Py_DECREF(s);
12280 Py_XDECREF(sep);
12281 return result;
12282}
12283
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012284PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012285 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012286\n\
12287Return a list of the words in S, using sep as the\n\
12288delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012289splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012290whitespace string is a separator and empty strings are\n\
12291removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012292
12293static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012294unicode_split(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012295{
12296 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012297 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012298
Martin v. Löwis18e16552006-02-15 17:27:45 +000012299 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012300 return NULL;
12301
12302 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012303 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012304 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012305 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012306 else
Benjamin Peterson29060642009-01-31 22:14:21 +000012307 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012308}
12309
Thomas Wouters477c8d52006-05-27 19:21:47 +000012310PyObject *
12311PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12312{
12313 PyObject* str_obj;
12314 PyObject* sep_obj;
12315 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012316 int kind1, kind2, kind;
12317 void *buf1 = NULL, *buf2 = NULL;
12318 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012319
12320 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020012321 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012322 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012323 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012324 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012325 Py_DECREF(str_obj);
12326 return NULL;
12327 }
12328
Victor Stinner14f8f022011-10-05 20:58:25 +020012329 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012330 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012331 kind = Py_MAX(kind1, kind2);
12332 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012333 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012334 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012335 if (!buf1)
12336 goto onError;
12337 buf2 = PyUnicode_DATA(sep_obj);
12338 if (kind2 != kind)
12339 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12340 if (!buf2)
12341 goto onError;
12342 len1 = PyUnicode_GET_LENGTH(str_obj);
12343 len2 = PyUnicode_GET_LENGTH(sep_obj);
12344
Victor Stinner14f8f022011-10-05 20:58:25 +020012345 switch(PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012346 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012347 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12348 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12349 else
12350 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012351 break;
12352 case PyUnicode_2BYTE_KIND:
12353 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12354 break;
12355 case PyUnicode_4BYTE_KIND:
12356 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12357 break;
12358 default:
12359 assert(0);
12360 out = 0;
12361 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012362
12363 Py_DECREF(sep_obj);
12364 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012365 if (kind1 != kind)
12366 PyMem_Free(buf1);
12367 if (kind2 != kind)
12368 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012369
12370 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012371 onError:
12372 Py_DECREF(sep_obj);
12373 Py_DECREF(str_obj);
12374 if (kind1 != kind && buf1)
12375 PyMem_Free(buf1);
12376 if (kind2 != kind && buf2)
12377 PyMem_Free(buf2);
12378 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012379}
12380
12381
12382PyObject *
12383PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12384{
12385 PyObject* str_obj;
12386 PyObject* sep_obj;
12387 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012388 int kind1, kind2, kind;
12389 void *buf1 = NULL, *buf2 = NULL;
12390 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012391
12392 str_obj = PyUnicode_FromObject(str_in);
12393 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012394 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012395 sep_obj = PyUnicode_FromObject(sep_in);
12396 if (!sep_obj) {
12397 Py_DECREF(str_obj);
12398 return NULL;
12399 }
12400
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012401 kind1 = PyUnicode_KIND(str_in);
12402 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012403 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012404 buf1 = PyUnicode_DATA(str_in);
12405 if (kind1 != kind)
12406 buf1 = _PyUnicode_AsKind(str_in, kind);
12407 if (!buf1)
12408 goto onError;
12409 buf2 = PyUnicode_DATA(sep_obj);
12410 if (kind2 != kind)
12411 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12412 if (!buf2)
12413 goto onError;
12414 len1 = PyUnicode_GET_LENGTH(str_obj);
12415 len2 = PyUnicode_GET_LENGTH(sep_obj);
12416
12417 switch(PyUnicode_KIND(str_in)) {
12418 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012419 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12420 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12421 else
12422 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012423 break;
12424 case PyUnicode_2BYTE_KIND:
12425 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12426 break;
12427 case PyUnicode_4BYTE_KIND:
12428 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12429 break;
12430 default:
12431 assert(0);
12432 out = 0;
12433 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012434
12435 Py_DECREF(sep_obj);
12436 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012437 if (kind1 != kind)
12438 PyMem_Free(buf1);
12439 if (kind2 != kind)
12440 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012441
12442 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012443 onError:
12444 Py_DECREF(sep_obj);
12445 Py_DECREF(str_obj);
12446 if (kind1 != kind && buf1)
12447 PyMem_Free(buf1);
12448 if (kind2 != kind && buf2)
12449 PyMem_Free(buf2);
12450 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012451}
12452
12453PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012454 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012455\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012456Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012457the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012458found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012459
12460static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012461unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012462{
Victor Stinner9310abb2011-10-05 00:59:23 +020012463 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012464}
12465
12466PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012467 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012468\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012469Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012470the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012471separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012472
12473static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012474unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012475{
Victor Stinner9310abb2011-10-05 00:59:23 +020012476 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012477}
12478
Alexander Belopolsky40018472011-02-26 01:02:56 +000012479PyObject *
12480PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012481{
12482 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012483
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012484 s = PyUnicode_FromObject(s);
12485 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012486 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012487 if (sep != NULL) {
12488 sep = PyUnicode_FromObject(sep);
12489 if (sep == NULL) {
12490 Py_DECREF(s);
12491 return NULL;
12492 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012493 }
12494
Victor Stinner9310abb2011-10-05 00:59:23 +020012495 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012496
12497 Py_DECREF(s);
12498 Py_XDECREF(sep);
12499 return result;
12500}
12501
12502PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012503 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012504\n\
12505Return a list of the words in S, using sep as the\n\
12506delimiter string, starting at the end of the string and\n\
12507working to the front. If maxsplit is given, at most maxsplit\n\
12508splits are done. If sep is not specified, any whitespace string\n\
12509is a separator.");
12510
12511static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012512unicode_rsplit(PyObject *self, PyObject *args)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012513{
12514 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012515 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012516
Martin v. Löwis18e16552006-02-15 17:27:45 +000012517 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012518 return NULL;
12519
12520 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012521 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012522 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012523 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012524 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012525 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012526}
12527
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012528PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012529 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012530\n\
12531Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012532Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012533is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012534
12535static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012536unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012537{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012538 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012539 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012540
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012541 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12542 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012543 return NULL;
12544
Guido van Rossum86662912000-04-11 15:38:46 +000012545 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012546}
12547
12548static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012549PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012550{
Walter Dörwald346737f2007-05-31 10:44:43 +000012551 if (PyUnicode_CheckExact(self)) {
12552 Py_INCREF(self);
12553 return self;
12554 } else
12555 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020012556 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012557}
12558
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012559PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012560 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012561\n\
12562Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012563and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012564
12565static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012566unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012567{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012568 return fixup(self, fixswapcase);
12569}
12570
Georg Brandlceee0772007-11-27 23:48:05 +000012571PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012572 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012573\n\
12574Return a translation table usable for str.translate().\n\
12575If there is only one argument, it must be a dictionary mapping Unicode\n\
12576ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012577Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012578If there are two arguments, they must be strings of equal length, and\n\
12579in the resulting dictionary, each character in x will be mapped to the\n\
12580character at the same position in y. If there is a third argument, it\n\
12581must be a string, whose characters will be mapped to None in the result.");
12582
12583static PyObject*
12584unicode_maketrans(PyUnicodeObject *null, PyObject *args)
12585{
12586 PyObject *x, *y = NULL, *z = NULL;
12587 PyObject *new = NULL, *key, *value;
12588 Py_ssize_t i = 0;
12589 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012590
Georg Brandlceee0772007-11-27 23:48:05 +000012591 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12592 return NULL;
12593 new = PyDict_New();
12594 if (!new)
12595 return NULL;
12596 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012597 int x_kind, y_kind, z_kind;
12598 void *x_data, *y_data, *z_data;
12599
Georg Brandlceee0772007-11-27 23:48:05 +000012600 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012601 if (!PyUnicode_Check(x)) {
12602 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12603 "be a string if there is a second argument");
12604 goto err;
12605 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012606 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012607 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12608 "arguments must have equal length");
12609 goto err;
12610 }
12611 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012612 x_kind = PyUnicode_KIND(x);
12613 y_kind = PyUnicode_KIND(y);
12614 x_data = PyUnicode_DATA(x);
12615 y_data = PyUnicode_DATA(y);
12616 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12617 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12618 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012619 if (!key || !value)
12620 goto err;
12621 res = PyDict_SetItem(new, key, value);
12622 Py_DECREF(key);
12623 Py_DECREF(value);
12624 if (res < 0)
12625 goto err;
12626 }
12627 /* create entries for deleting chars in z */
12628 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012629 z_kind = PyUnicode_KIND(z);
12630 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012631 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012632 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012633 if (!key)
12634 goto err;
12635 res = PyDict_SetItem(new, key, Py_None);
12636 Py_DECREF(key);
12637 if (res < 0)
12638 goto err;
12639 }
12640 }
12641 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012642 int kind;
12643 void *data;
12644
Georg Brandlceee0772007-11-27 23:48:05 +000012645 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012646 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012647 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12648 "to maketrans it must be a dict");
12649 goto err;
12650 }
12651 /* copy entries into the new dict, converting string keys to int keys */
12652 while (PyDict_Next(x, &i, &key, &value)) {
12653 if (PyUnicode_Check(key)) {
12654 /* convert string keys to integer keys */
12655 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012656 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012657 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12658 "table must be of length 1");
12659 goto err;
12660 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012661 kind = PyUnicode_KIND(key);
12662 data = PyUnicode_DATA(key);
12663 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012664 if (!newkey)
12665 goto err;
12666 res = PyDict_SetItem(new, newkey, value);
12667 Py_DECREF(newkey);
12668 if (res < 0)
12669 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012670 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012671 /* just keep integer keys */
12672 if (PyDict_SetItem(new, key, value) < 0)
12673 goto err;
12674 } else {
12675 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12676 "be strings or integers");
12677 goto err;
12678 }
12679 }
12680 }
12681 return new;
12682 err:
12683 Py_DECREF(new);
12684 return NULL;
12685}
12686
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012687PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012688 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012689\n\
12690Return a copy of the string S, where all characters have been mapped\n\
12691through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012692Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012693Unmapped characters are left untouched. Characters mapped to None\n\
12694are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012695
12696static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012697unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012698{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012699 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012700}
12701
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012702PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012703 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012704\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012705Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012706
12707static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012708unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012709{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012710 return fixup(self, fixupper);
12711}
12712
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012713PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012714 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012715\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012716Pad a numeric string S with zeros on the left, to fill a field\n\
12717of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012718
12719static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012720unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012721{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012722 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012723 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012724 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012725 int kind;
12726 void *data;
12727 Py_UCS4 chr;
12728
12729 if (PyUnicode_READY(self) == -1)
12730 return NULL;
12731
Martin v. Löwis18e16552006-02-15 17:27:45 +000012732 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012733 return NULL;
12734
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012735 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012736 if (PyUnicode_CheckExact(self)) {
12737 Py_INCREF(self);
12738 return (PyObject*) self;
12739 }
12740 else
Victor Stinner2219e0a2011-10-01 01:16:59 +020012741 return PyUnicode_Copy((PyObject*)self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012742 }
12743
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012744 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012745
12746 u = pad(self, fill, 0, '0');
12747
Walter Dörwald068325e2002-04-15 13:36:47 +000012748 if (u == NULL)
12749 return NULL;
12750
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012751 kind = PyUnicode_KIND(u);
12752 data = PyUnicode_DATA(u);
12753 chr = PyUnicode_READ(kind, data, fill);
12754
12755 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012756 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012757 PyUnicode_WRITE(kind, data, 0, chr);
12758 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012759 }
12760
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012761 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012762 return (PyObject*) u;
12763}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012764
12765#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012766static PyObject *
12767unicode__decimal2ascii(PyObject *self)
12768{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012769 return PyUnicode_TransformDecimalAndSpaceToASCII(self);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012770}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012771#endif
12772
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012773PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012774 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012775\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012776Return True if S starts with the specified prefix, False otherwise.\n\
12777With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012778With optional end, stop comparing S at that position.\n\
12779prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012780
12781static PyObject *
12782unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012783 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012784{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012785 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012786 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012787 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012788 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012789 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012790
Jesus Ceaac451502011-04-20 17:09:23 +020012791 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012792 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012793 if (PyTuple_Check(subobj)) {
12794 Py_ssize_t i;
12795 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12796 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012797 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012798 if (substring == NULL)
12799 return NULL;
12800 result = tailmatch(self, substring, start, end, -1);
12801 Py_DECREF(substring);
12802 if (result) {
12803 Py_RETURN_TRUE;
12804 }
12805 }
12806 /* nothing matched */
12807 Py_RETURN_FALSE;
12808 }
12809 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012810 if (substring == NULL) {
12811 if (PyErr_ExceptionMatches(PyExc_TypeError))
12812 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12813 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012814 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012815 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012816 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012817 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012818 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012819}
12820
12821
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012822PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012823 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012824\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012825Return True if S ends with the specified suffix, False otherwise.\n\
12826With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012827With optional end, stop comparing S at that position.\n\
12828suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012829
12830static PyObject *
12831unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012832 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012833{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012834 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012835 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012836 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012837 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012838 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012839
Jesus Ceaac451502011-04-20 17:09:23 +020012840 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012841 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012842 if (PyTuple_Check(subobj)) {
12843 Py_ssize_t i;
12844 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12845 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012846 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012847 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012848 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012849 result = tailmatch(self, substring, start, end, +1);
12850 Py_DECREF(substring);
12851 if (result) {
12852 Py_RETURN_TRUE;
12853 }
12854 }
12855 Py_RETURN_FALSE;
12856 }
12857 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012858 if (substring == NULL) {
12859 if (PyErr_ExceptionMatches(PyExc_TypeError))
12860 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12861 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012862 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012863 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012864 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012865 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012866 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012867}
12868
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012869#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012870
12871PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012872 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012873\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012874Return a formatted version of S, using substitutions from args and kwargs.\n\
12875The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012876
Eric Smith27bbca62010-11-04 17:06:58 +000012877PyDoc_STRVAR(format_map__doc__,
12878 "S.format_map(mapping) -> str\n\
12879\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012880Return a formatted version of S, using substitutions from mapping.\n\
12881The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012882
Eric Smith4a7d76d2008-05-30 18:10:19 +000012883static PyObject *
12884unicode__format__(PyObject* self, PyObject* args)
12885{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012886 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012887
12888 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12889 return NULL;
12890
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012891 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012892 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012893 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012894}
12895
Eric Smith8c663262007-08-25 02:26:07 +000012896PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012897 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012898\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012899Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012900
12901static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012902unicode__sizeof__(PyUnicodeObject *v)
12903{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012904 Py_ssize_t size;
12905
12906 /* If it's a compact object, account for base structure +
12907 character data. */
12908 if (PyUnicode_IS_COMPACT_ASCII(v))
12909 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12910 else if (PyUnicode_IS_COMPACT(v))
12911 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012912 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012913 else {
12914 /* If it is a two-block object, account for base object, and
12915 for character block if present. */
12916 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012917 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012918 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012919 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012920 }
12921 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012922 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012923 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012924 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012925 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012926 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012927
12928 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012929}
12930
12931PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012932 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012933
12934static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012935unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012936{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012937 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012938 if (!copy)
12939 return NULL;
12940 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012941}
12942
Guido van Rossumd57fd912000-03-10 22:53:23 +000012943static PyMethodDef unicode_methods[] = {
12944
12945 /* Order is according to common usage: often used methods should
12946 appear first, since lookup is done sequentially. */
12947
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012948 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012949 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12950 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012951 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012952 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12953 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12954 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12955 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12956 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12957 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12958 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012959 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012960 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12961 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12962 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012963 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012964 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12965 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12966 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012967 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012968 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012969 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012970 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012971 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12972 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12973 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12974 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12975 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12976 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12977 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12978 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12979 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12980 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12981 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12982 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12983 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12984 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012985 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012986 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012987 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012988 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012989 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012990 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012991 {"maketrans", (PyCFunction) unicode_maketrans,
12992 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012993 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012994#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012995 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012996#endif
12997
12998#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012999 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013000 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013001#endif
13002
Benjamin Peterson14339b62009-01-31 16:36:08 +000013003 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013004 {NULL, NULL}
13005};
13006
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013007static PyObject *
13008unicode_mod(PyObject *v, PyObject *w)
13009{
Brian Curtindfc80e32011-08-10 20:28:54 -050013010 if (!PyUnicode_Check(v))
13011 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013012 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013013}
13014
13015static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013016 0, /*nb_add*/
13017 0, /*nb_subtract*/
13018 0, /*nb_multiply*/
13019 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013020};
13021
Guido van Rossumd57fd912000-03-10 22:53:23 +000013022static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013023 (lenfunc) unicode_length, /* sq_length */
13024 PyUnicode_Concat, /* sq_concat */
13025 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13026 (ssizeargfunc) unicode_getitem, /* sq_item */
13027 0, /* sq_slice */
13028 0, /* sq_ass_item */
13029 0, /* sq_ass_slice */
13030 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013031};
13032
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013033static PyObject*
13034unicode_subscript(PyUnicodeObject* self, PyObject* item)
13035{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013036 if (PyUnicode_READY(self) == -1)
13037 return NULL;
13038
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013039 if (PyIndex_Check(item)) {
13040 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013041 if (i == -1 && PyErr_Occurred())
13042 return NULL;
13043 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013044 i += PyUnicode_GET_LENGTH(self);
Victor Stinner2fe5ced2011-10-02 00:25:40 +020013045 return unicode_getitem((PyObject*)self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013046 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013047 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013048 PyObject *result;
13049 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013050 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013051 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013052
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013053 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013054 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013055 return NULL;
13056 }
13057
13058 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013059 return PyUnicode_New(0, 0);
13060 } else if (start == 0 && step == 1 &&
13061 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000013062 PyUnicode_CheckExact(self)) {
13063 Py_INCREF(self);
13064 return (PyObject *)self;
13065 } else if (step == 1) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020013066 return PyUnicode_Substring((PyObject*)self,
13067 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013068 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013069 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013070 src_kind = PyUnicode_KIND(self);
13071 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013072 if (!PyUnicode_IS_ASCII(self)) {
13073 kind_limit = kind_maxchar_limit(src_kind);
13074 max_char = 0;
13075 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13076 ch = PyUnicode_READ(src_kind, src_data, cur);
13077 if (ch > max_char) {
13078 max_char = ch;
13079 if (max_char >= kind_limit)
13080 break;
13081 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013082 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013083 }
Victor Stinner55c99112011-10-13 01:17:06 +020013084 else
13085 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013086 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013087 if (result == NULL)
13088 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013089 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013090 dest_data = PyUnicode_DATA(result);
13091
13092 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013093 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13094 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013095 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013096 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013097 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013098 } else {
13099 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13100 return NULL;
13101 }
13102}
13103
13104static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013105 (lenfunc)unicode_length, /* mp_length */
13106 (binaryfunc)unicode_subscript, /* mp_subscript */
13107 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013108};
13109
Guido van Rossumd57fd912000-03-10 22:53:23 +000013110
Guido van Rossumd57fd912000-03-10 22:53:23 +000013111/* Helpers for PyUnicode_Format() */
13112
13113static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000013114getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013115{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013116 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013117 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013118 (*p_argidx)++;
13119 if (arglen < 0)
13120 return args;
13121 else
13122 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013123 }
13124 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013125 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013126 return NULL;
13127}
13128
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013129/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013130
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013131static PyObject *
13132formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013133{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013134 char *p;
13135 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013136 double x;
Tim Petersced69f82003-09-16 20:30:58 +000013137
Guido van Rossumd57fd912000-03-10 22:53:23 +000013138 x = PyFloat_AsDouble(v);
13139 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013140 return NULL;
13141
Guido van Rossumd57fd912000-03-10 22:53:23 +000013142 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013143 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013144
Eric Smith0923d1d2009-04-16 20:16:10 +000013145 p = PyOS_double_to_string(x, type, prec,
13146 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013147 if (p == NULL)
13148 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013149 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000013150 PyMem_Free(p);
13151 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013152}
13153
Tim Peters38fd5b62000-09-21 05:43:11 +000013154static PyObject*
13155formatlong(PyObject *val, int flags, int prec, int type)
13156{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013157 char *buf;
13158 int len;
13159 PyObject *str; /* temporary string object. */
13160 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013161
Benjamin Peterson14339b62009-01-31 16:36:08 +000013162 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
13163 if (!str)
13164 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013165 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013166 Py_DECREF(str);
13167 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013168}
13169
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013170static Py_UCS4
13171formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013172{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013173 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013174 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013175 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013176 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013177 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013178 goto onError;
13179 }
13180 else {
13181 /* Integer input truncated to a character */
13182 long x;
13183 x = PyLong_AsLong(v);
13184 if (x == -1 && PyErr_Occurred())
13185 goto onError;
13186
13187 if (x < 0 || x > 0x10ffff) {
13188 PyErr_SetString(PyExc_OverflowError,
13189 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013190 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013191 }
13192
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013193 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013194 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013195
Benjamin Peterson29060642009-01-31 22:14:21 +000013196 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013197 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013198 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013199 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013200}
13201
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013202static int
13203repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
13204{
13205 int r;
13206 assert(count > 0);
13207 assert(PyUnicode_Check(obj));
13208 if (count > 5) {
13209 PyObject *repeated = unicode_repeat((PyUnicodeObject *) obj, count);
13210 if (repeated == NULL)
13211 return -1;
13212 r = _PyAccu_Accumulate(acc, repeated);
13213 Py_DECREF(repeated);
13214 return r;
13215 }
13216 else {
13217 do {
13218 if (_PyAccu_Accumulate(acc, obj))
13219 return -1;
13220 } while (--count);
13221 return 0;
13222 }
13223}
13224
Alexander Belopolsky40018472011-02-26 01:02:56 +000013225PyObject *
13226PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013227{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013228 void *fmt;
13229 int fmtkind;
13230 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013231 int kind;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013232 int r;
13233 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013234 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013235 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013236 PyObject *temp = NULL;
13237 PyObject *second = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013238 PyUnicodeObject *uformat;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013239 _PyAccu acc;
13240 static PyObject *plus, *minus, *blank, *zero, *percent;
13241
13242 if (!plus && !(plus = get_latin1_char('+')))
13243 return NULL;
13244 if (!minus && !(minus = get_latin1_char('-')))
13245 return NULL;
13246 if (!blank && !(blank = get_latin1_char(' ')))
13247 return NULL;
13248 if (!zero && !(zero = get_latin1_char('0')))
13249 return NULL;
13250 if (!percent && !(percent = get_latin1_char('%')))
13251 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000013252
Guido van Rossumd57fd912000-03-10 22:53:23 +000013253 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013254 PyErr_BadInternalCall();
13255 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013256 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013257 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
13258 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013259 return NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013260 if (_PyAccu_Init(&acc))
13261 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013262 fmt = PyUnicode_DATA(uformat);
13263 fmtkind = PyUnicode_KIND(uformat);
13264 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13265 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013266
Guido van Rossumd57fd912000-03-10 22:53:23 +000013267 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013268 arglen = PyTuple_Size(args);
13269 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013270 }
13271 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013272 arglen = -1;
13273 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013274 }
Christian Heimes90aa7642007-12-19 02:45:37 +000013275 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000013276 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013277 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013278
13279 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013280 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013281 PyObject *nonfmt;
13282 Py_ssize_t nonfmtpos;
13283 nonfmtpos = fmtpos++;
13284 while (fmtcnt >= 0 &&
13285 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13286 fmtpos++;
13287 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013288 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013289 nonfmt = PyUnicode_Substring((PyObject *) uformat, nonfmtpos, fmtpos);
13290 if (nonfmt == NULL)
13291 goto onError;
13292 r = _PyAccu_Accumulate(&acc, nonfmt);
13293 Py_DECREF(nonfmt);
13294 if (r)
13295 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013296 }
13297 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013298 /* Got a format specifier */
13299 int flags = 0;
13300 Py_ssize_t width = -1;
13301 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013302 Py_UCS4 c = '\0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013303 Py_UCS4 fill, sign;
Benjamin Peterson29060642009-01-31 22:14:21 +000013304 int isnumok;
13305 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013306 void *pbuf = NULL;
13307 Py_ssize_t pindex, len;
13308 PyObject *signobj = NULL, *fillobj = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013309
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013310 fmtpos++;
13311 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
13312 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013313 Py_ssize_t keylen;
13314 PyObject *key;
13315 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013316
Benjamin Peterson29060642009-01-31 22:14:21 +000013317 if (dict == NULL) {
13318 PyErr_SetString(PyExc_TypeError,
13319 "format requires a mapping");
13320 goto onError;
13321 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013322 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013323 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013324 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013325 /* Skip over balanced parentheses */
13326 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013327 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013328 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013329 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013330 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013331 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013332 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013333 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013334 if (fmtcnt < 0 || pcount > 0) {
13335 PyErr_SetString(PyExc_ValueError,
13336 "incomplete format key");
13337 goto onError;
13338 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020013339 key = PyUnicode_Substring((PyObject*)uformat,
13340 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013341 if (key == NULL)
13342 goto onError;
13343 if (args_owned) {
13344 Py_DECREF(args);
13345 args_owned = 0;
13346 }
13347 args = PyObject_GetItem(dict, key);
13348 Py_DECREF(key);
13349 if (args == NULL) {
13350 goto onError;
13351 }
13352 args_owned = 1;
13353 arglen = -1;
13354 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013355 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013356 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013357 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013358 case '-': flags |= F_LJUST; continue;
13359 case '+': flags |= F_SIGN; continue;
13360 case ' ': flags |= F_BLANK; continue;
13361 case '#': flags |= F_ALT; continue;
13362 case '0': flags |= F_ZERO; continue;
13363 }
13364 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013365 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013366 if (c == '*') {
13367 v = getnextarg(args, arglen, &argidx);
13368 if (v == NULL)
13369 goto onError;
13370 if (!PyLong_Check(v)) {
13371 PyErr_SetString(PyExc_TypeError,
13372 "* wants int");
13373 goto onError;
13374 }
13375 width = PyLong_AsLong(v);
13376 if (width == -1 && PyErr_Occurred())
13377 goto onError;
13378 if (width < 0) {
13379 flags |= F_LJUST;
13380 width = -width;
13381 }
13382 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013383 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013384 }
13385 else if (c >= '0' && c <= '9') {
13386 width = c - '0';
13387 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013388 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013389 if (c < '0' || c > '9')
13390 break;
13391 if ((width*10) / 10 != width) {
13392 PyErr_SetString(PyExc_ValueError,
13393 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013394 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013395 }
13396 width = width*10 + (c - '0');
13397 }
13398 }
13399 if (c == '.') {
13400 prec = 0;
13401 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013402 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013403 if (c == '*') {
13404 v = getnextarg(args, arglen, &argidx);
13405 if (v == NULL)
13406 goto onError;
13407 if (!PyLong_Check(v)) {
13408 PyErr_SetString(PyExc_TypeError,
13409 "* wants int");
13410 goto onError;
13411 }
13412 prec = PyLong_AsLong(v);
13413 if (prec == -1 && PyErr_Occurred())
13414 goto onError;
13415 if (prec < 0)
13416 prec = 0;
13417 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013418 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013419 }
13420 else if (c >= '0' && c <= '9') {
13421 prec = c - '0';
13422 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013423 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013424 if (c < '0' || c > '9')
13425 break;
13426 if ((prec*10) / 10 != prec) {
13427 PyErr_SetString(PyExc_ValueError,
13428 "prec too big");
13429 goto onError;
13430 }
13431 prec = prec*10 + (c - '0');
13432 }
13433 }
13434 } /* prec */
13435 if (fmtcnt >= 0) {
13436 if (c == 'h' || c == 'l' || c == 'L') {
13437 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013438 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013439 }
13440 }
13441 if (fmtcnt < 0) {
13442 PyErr_SetString(PyExc_ValueError,
13443 "incomplete format");
13444 goto onError;
13445 }
13446 if (c != '%') {
13447 v = getnextarg(args, arglen, &argidx);
13448 if (v == NULL)
13449 goto onError;
13450 }
13451 sign = 0;
13452 fill = ' ';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013453 fillobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013454 switch (c) {
13455
13456 case '%':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013457 _PyAccu_Accumulate(&acc, percent);
13458 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013459
13460 case 's':
13461 case 'r':
13462 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000013463 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013464 temp = v;
13465 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013466 }
13467 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013468 if (c == 's')
13469 temp = PyObject_Str(v);
13470 else if (c == 'r')
13471 temp = PyObject_Repr(v);
13472 else
13473 temp = PyObject_ASCII(v);
13474 if (temp == NULL)
13475 goto onError;
13476 if (PyUnicode_Check(temp))
13477 /* nothing to do */;
13478 else {
13479 Py_DECREF(temp);
13480 PyErr_SetString(PyExc_TypeError,
13481 "%s argument has non-string str()");
13482 goto onError;
13483 }
13484 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013485 if (PyUnicode_READY(temp) == -1) {
13486 Py_CLEAR(temp);
13487 goto onError;
13488 }
13489 pbuf = PyUnicode_DATA(temp);
13490 kind = PyUnicode_KIND(temp);
13491 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013492 if (prec >= 0 && len > prec)
13493 len = prec;
13494 break;
13495
13496 case 'i':
13497 case 'd':
13498 case 'u':
13499 case 'o':
13500 case 'x':
13501 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000013502 isnumok = 0;
13503 if (PyNumber_Check(v)) {
13504 PyObject *iobj=NULL;
13505
13506 if (PyLong_Check(v)) {
13507 iobj = v;
13508 Py_INCREF(iobj);
13509 }
13510 else {
13511 iobj = PyNumber_Long(v);
13512 }
13513 if (iobj!=NULL) {
13514 if (PyLong_Check(iobj)) {
13515 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013516 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013517 Py_DECREF(iobj);
13518 if (!temp)
13519 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013520 if (PyUnicode_READY(temp) == -1) {
13521 Py_CLEAR(temp);
13522 goto onError;
13523 }
13524 pbuf = PyUnicode_DATA(temp);
13525 kind = PyUnicode_KIND(temp);
13526 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013527 sign = 1;
13528 }
13529 else {
13530 Py_DECREF(iobj);
13531 }
13532 }
13533 }
13534 if (!isnumok) {
13535 PyErr_Format(PyExc_TypeError,
13536 "%%%c format: a number is required, "
13537 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13538 goto onError;
13539 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013540 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013541 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013542 fillobj = zero;
13543 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013544 break;
13545
13546 case 'e':
13547 case 'E':
13548 case 'f':
13549 case 'F':
13550 case 'g':
13551 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013552 temp = formatfloat(v, flags, prec, c);
13553 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000013554 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013555 if (PyUnicode_READY(temp) == -1) {
13556 Py_CLEAR(temp);
13557 goto onError;
13558 }
13559 pbuf = PyUnicode_DATA(temp);
13560 kind = PyUnicode_KIND(temp);
13561 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013562 sign = 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013563 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013564 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013565 fillobj = zero;
13566 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013567 break;
13568
13569 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013570 {
13571 Py_UCS4 ch = formatchar(v);
13572 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013573 goto onError;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013574 temp = _PyUnicode_FromUCS4(&ch, 1);
13575 if (temp == NULL)
13576 goto onError;
13577 pbuf = PyUnicode_DATA(temp);
13578 kind = PyUnicode_KIND(temp);
13579 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013580 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013581 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013582
13583 default:
13584 PyErr_Format(PyExc_ValueError,
13585 "unsupported format character '%c' (0x%x) "
13586 "at index %zd",
13587 (31<=c && c<=126) ? (char)c : '?',
13588 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013589 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013590 goto onError;
13591 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013592 /* pbuf is initialized here. */
13593 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013594 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013595 if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
13596 signobj = minus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013597 len--;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013598 pindex++;
13599 }
13600 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
13601 signobj = plus;
13602 len--;
13603 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013604 }
13605 else if (flags & F_SIGN)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013606 signobj = plus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013607 else if (flags & F_BLANK)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013608 signobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013609 else
13610 sign = 0;
13611 }
13612 if (width < len)
13613 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013614 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013615 if (fill != ' ') {
13616 assert(signobj != NULL);
13617 if (_PyAccu_Accumulate(&acc, signobj))
13618 goto onError;
13619 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013620 if (width > len)
13621 width--;
13622 }
13623 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013624 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013625 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013626 if (fill != ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013627 second = get_latin1_char(
13628 PyUnicode_READ(kind, pbuf, pindex + 1));
13629 pindex += 2;
13630 if (second == NULL ||
13631 _PyAccu_Accumulate(&acc, zero) ||
13632 _PyAccu_Accumulate(&acc, second))
13633 goto onError;
13634 Py_CLEAR(second);
Benjamin Peterson29060642009-01-31 22:14:21 +000013635 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013636 width -= 2;
13637 if (width < 0)
13638 width = 0;
13639 len -= 2;
13640 }
13641 if (width > len && !(flags & F_LJUST)) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013642 assert(fillobj != NULL);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013643 if (repeat_accumulate(&acc, fillobj, width - len))
13644 goto onError;
13645 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013646 }
13647 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013648 if (sign) {
13649 assert(signobj != NULL);
13650 if (_PyAccu_Accumulate(&acc, signobj))
13651 goto onError;
13652 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013653 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013654 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13655 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013656 second = get_latin1_char(
13657 PyUnicode_READ(kind, pbuf, pindex + 1));
13658 pindex += 2;
13659 if (second == NULL ||
13660 _PyAccu_Accumulate(&acc, zero) ||
13661 _PyAccu_Accumulate(&acc, second))
13662 goto onError;
13663 Py_CLEAR(second);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013664 }
13665 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013666 /* Copy all characters, preserving len */
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013667 if (temp != NULL) {
13668 assert(pbuf == PyUnicode_DATA(temp));
13669 v = PyUnicode_Substring(temp, pindex, pindex + len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013670 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013671 else {
13672 const char *p = (const char *) pbuf;
13673 assert(pbuf != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013674 p += kind * pindex;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013675 v = PyUnicode_FromKindAndData(kind, p, len);
13676 }
13677 if (v == NULL)
13678 goto onError;
13679 r = _PyAccu_Accumulate(&acc, v);
13680 Py_DECREF(v);
13681 if (r)
13682 goto onError;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013683 if (width > len && repeat_accumulate(&acc, blank, width - len))
13684 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013685 if (dict && (argidx < arglen) && c != '%') {
13686 PyErr_SetString(PyExc_TypeError,
13687 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013688 goto onError;
13689 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013690 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013691 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013692 } /* until end */
13693 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013694 PyErr_SetString(PyExc_TypeError,
13695 "not all arguments converted during string formatting");
13696 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013697 }
13698
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013699 result = _PyAccu_Finish(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013700 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013701 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013702 }
13703 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013704 Py_XDECREF(temp);
13705 Py_XDECREF(second);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013706 return (PyObject *)result;
13707
Benjamin Peterson29060642009-01-31 22:14:21 +000013708 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013709 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013710 Py_XDECREF(temp);
13711 Py_XDECREF(second);
13712 _PyAccu_Destroy(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013713 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013714 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013715 }
13716 return NULL;
13717}
13718
Jeremy Hylton938ace62002-07-17 16:30:39 +000013719static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013720unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13721
Tim Peters6d6c1a32001-08-02 04:15:00 +000013722static PyObject *
13723unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13724{
Benjamin Peterson29060642009-01-31 22:14:21 +000013725 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013726 static char *kwlist[] = {"object", "encoding", "errors", 0};
13727 char *encoding = NULL;
13728 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013729
Benjamin Peterson14339b62009-01-31 16:36:08 +000013730 if (type != &PyUnicode_Type)
13731 return unicode_subtype_new(type, args, kwds);
13732 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013733 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013734 return NULL;
13735 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013736 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013737 if (encoding == NULL && errors == NULL)
13738 return PyObject_Str(x);
13739 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013740 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013741}
13742
Guido van Rossume023fe02001-08-30 03:12:59 +000013743static PyObject *
13744unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13745{
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013746 PyUnicodeObject *unicode, *self;
13747 Py_ssize_t length, char_size;
13748 int share_wstr, share_utf8;
13749 unsigned int kind;
13750 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013751
Benjamin Peterson14339b62009-01-31 16:36:08 +000013752 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013753
13754 unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
13755 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013756 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013757 assert(_PyUnicode_CHECK(unicode));
Victor Stinnere06e1452011-10-04 20:52:31 +020013758 if (PyUnicode_READY(unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013759 return NULL;
13760
13761 self = (PyUnicodeObject *) type->tp_alloc(type, 0);
13762 if (self == NULL) {
13763 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013764 return NULL;
13765 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013766 kind = PyUnicode_KIND(unicode);
13767 length = PyUnicode_GET_LENGTH(unicode);
13768
13769 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013770#ifdef Py_DEBUG
13771 _PyUnicode_HASH(self) = -1;
13772#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013773 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013774#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013775 _PyUnicode_STATE(self).interned = 0;
13776 _PyUnicode_STATE(self).kind = kind;
13777 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013778 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013779 _PyUnicode_STATE(self).ready = 1;
13780 _PyUnicode_WSTR(self) = NULL;
13781 _PyUnicode_UTF8_LENGTH(self) = 0;
13782 _PyUnicode_UTF8(self) = NULL;
13783 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013784 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013785
13786 share_utf8 = 0;
13787 share_wstr = 0;
13788 if (kind == PyUnicode_1BYTE_KIND) {
13789 char_size = 1;
13790 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13791 share_utf8 = 1;
13792 }
13793 else if (kind == PyUnicode_2BYTE_KIND) {
13794 char_size = 2;
13795 if (sizeof(wchar_t) == 2)
13796 share_wstr = 1;
13797 }
13798 else {
13799 assert(kind == PyUnicode_4BYTE_KIND);
13800 char_size = 4;
13801 if (sizeof(wchar_t) == 4)
13802 share_wstr = 1;
13803 }
13804
13805 /* Ensure we won't overflow the length. */
13806 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13807 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013808 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013809 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013810 data = PyObject_MALLOC((length + 1) * char_size);
13811 if (data == NULL) {
13812 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013813 goto onError;
13814 }
13815
Victor Stinnerc3c74152011-10-02 20:39:55 +020013816 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013817 if (share_utf8) {
13818 _PyUnicode_UTF8_LENGTH(self) = length;
13819 _PyUnicode_UTF8(self) = data;
13820 }
13821 if (share_wstr) {
13822 _PyUnicode_WSTR_LENGTH(self) = length;
13823 _PyUnicode_WSTR(self) = (wchar_t *)data;
13824 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013825
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013826 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013827 kind * (length + 1));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013828 Py_DECREF(unicode);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013829 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013830#ifdef Py_DEBUG
13831 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13832#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013833 return (PyObject *)self;
13834
13835onError:
13836 Py_DECREF(unicode);
13837 Py_DECREF(self);
13838 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013839}
13840
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013841PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000013842 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000013843\n\
Collin Winterd474ce82007-08-07 19:42:11 +000013844Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000013845encoding defaults to the current default string encoding.\n\
13846errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013847
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013848static PyObject *unicode_iter(PyObject *seq);
13849
Guido van Rossumd57fd912000-03-10 22:53:23 +000013850PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000013851 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013852 "str", /* tp_name */
13853 sizeof(PyUnicodeObject), /* tp_size */
13854 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013855 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013856 (destructor)unicode_dealloc, /* tp_dealloc */
13857 0, /* tp_print */
13858 0, /* tp_getattr */
13859 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013860 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013861 unicode_repr, /* tp_repr */
13862 &unicode_as_number, /* tp_as_number */
13863 &unicode_as_sequence, /* tp_as_sequence */
13864 &unicode_as_mapping, /* tp_as_mapping */
13865 (hashfunc) unicode_hash, /* tp_hash*/
13866 0, /* tp_call*/
13867 (reprfunc) unicode_str, /* tp_str */
13868 PyObject_GenericGetAttr, /* tp_getattro */
13869 0, /* tp_setattro */
13870 0, /* tp_as_buffer */
13871 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000013872 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013873 unicode_doc, /* tp_doc */
13874 0, /* tp_traverse */
13875 0, /* tp_clear */
13876 PyUnicode_RichCompare, /* tp_richcompare */
13877 0, /* tp_weaklistoffset */
13878 unicode_iter, /* tp_iter */
13879 0, /* tp_iternext */
13880 unicode_methods, /* tp_methods */
13881 0, /* tp_members */
13882 0, /* tp_getset */
13883 &PyBaseObject_Type, /* tp_base */
13884 0, /* tp_dict */
13885 0, /* tp_descr_get */
13886 0, /* tp_descr_set */
13887 0, /* tp_dictoffset */
13888 0, /* tp_init */
13889 0, /* tp_alloc */
13890 unicode_new, /* tp_new */
13891 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013892};
13893
13894/* Initialize the Unicode implementation */
13895
Victor Stinner3a50e702011-10-18 21:21:00 +020013896int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013897{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013898 int i;
13899
Thomas Wouters477c8d52006-05-27 19:21:47 +000013900 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013901 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013902 0x000A, /* LINE FEED */
13903 0x000D, /* CARRIAGE RETURN */
13904 0x001C, /* FILE SEPARATOR */
13905 0x001D, /* GROUP SEPARATOR */
13906 0x001E, /* RECORD SEPARATOR */
13907 0x0085, /* NEXT LINE */
13908 0x2028, /* LINE SEPARATOR */
13909 0x2029, /* PARAGRAPH SEPARATOR */
13910 };
13911
Fred Drakee4315f52000-05-09 19:53:39 +000013912 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013913 unicode_empty = PyUnicode_New(0, 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013914 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013915 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013916 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013917
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013918 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013919 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013920 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013921 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013922
13923 /* initialize the linebreak bloom filter */
13924 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013925 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013926 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013927
13928 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020013929
13930#ifdef HAVE_MBCS
13931 winver.dwOSVersionInfoSize = sizeof(winver);
13932 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
13933 PyErr_SetFromWindowsErr(0);
13934 return -1;
13935 }
13936#endif
13937 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013938}
13939
/* Finalize the Unicode implementation */

/* This implementation performs no work here and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
13947
Guido van Rossumd57fd912000-03-10 22:53:23 +000013948void
Thomas Wouters78890102000-07-22 19:25:51 +000013949_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013950{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013951 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013952
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013953 Py_XDECREF(unicode_empty);
13954 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013955
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013956 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013957 if (unicode_latin1[i]) {
13958 Py_DECREF(unicode_latin1[i]);
13959 unicode_latin1[i] = NULL;
13960 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013961 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020013962 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000013963 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013964}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013965
Walter Dörwald16807132007-05-25 13:52:07 +000013966void
13967PyUnicode_InternInPlace(PyObject **p)
13968{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013969 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
13970 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020013971#ifdef Py_DEBUG
13972 assert(s != NULL);
13973 assert(_PyUnicode_CHECK(s));
13974#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000013975 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020013976 return;
13977#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000013978 /* If it's a subclass, we don't really know what putting
13979 it in the interned dict might do. */
13980 if (!PyUnicode_CheckExact(s))
13981 return;
13982 if (PyUnicode_CHECK_INTERNED(s))
13983 return;
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013984 if (_PyUnicode_READY_REPLACE(p)) {
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013985 assert(0 && "_PyUnicode_READY_REPLACE fail in PyUnicode_InternInPlace");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013986 return;
13987 }
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013988 s = (PyUnicodeObject *)(*p);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013989 if (interned == NULL) {
13990 interned = PyDict_New();
13991 if (interned == NULL) {
13992 PyErr_Clear(); /* Don't leave an exception */
13993 return;
13994 }
13995 }
13996 /* It might be that the GetItem call fails even
13997 though the key is present in the dictionary,
13998 namely when this happens during a stack overflow. */
13999 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000014000 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014001 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014002
Benjamin Peterson29060642009-01-31 22:14:21 +000014003 if (t) {
14004 Py_INCREF(t);
14005 Py_DECREF(*p);
14006 *p = t;
14007 return;
14008 }
Walter Dörwald16807132007-05-25 13:52:07 +000014009
Benjamin Peterson14339b62009-01-31 16:36:08 +000014010 PyThreadState_GET()->recursion_critical = 1;
14011 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
14012 PyErr_Clear();
14013 PyThreadState_GET()->recursion_critical = 0;
14014 return;
14015 }
14016 PyThreadState_GET()->recursion_critical = 0;
14017 /* The two references in interned are not counted by refcnt.
14018 The deallocator will take care of this */
14019 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014020 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014021}
14022
14023void
14024PyUnicode_InternImmortal(PyObject **p)
14025{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014026 PyUnicodeObject *u = (PyUnicodeObject *)*p;
14027
Benjamin Peterson14339b62009-01-31 16:36:08 +000014028 PyUnicode_InternInPlace(p);
14029 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014030 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014031 Py_INCREF(*p);
14032 }
Walter Dörwald16807132007-05-25 13:52:07 +000014033}
14034
14035PyObject *
14036PyUnicode_InternFromString(const char *cp)
14037{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014038 PyObject *s = PyUnicode_FromString(cp);
14039 if (s == NULL)
14040 return NULL;
14041 PyUnicode_InternInPlace(&s);
14042 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014043}
14044
Alexander Belopolsky40018472011-02-26 01:02:56 +000014045void
14046_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014047{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014048 PyObject *keys;
14049 PyUnicodeObject *s;
14050 Py_ssize_t i, n;
14051 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014052
Benjamin Peterson14339b62009-01-31 16:36:08 +000014053 if (interned == NULL || !PyDict_Check(interned))
14054 return;
14055 keys = PyDict_Keys(interned);
14056 if (keys == NULL || !PyList_Check(keys)) {
14057 PyErr_Clear();
14058 return;
14059 }
Walter Dörwald16807132007-05-25 13:52:07 +000014060
Benjamin Peterson14339b62009-01-31 16:36:08 +000014061 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14062 detector, interned unicode strings are not forcibly deallocated;
14063 rather, we give them their stolen references back, and then clear
14064 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014065
Benjamin Peterson14339b62009-01-31 16:36:08 +000014066 n = PyList_GET_SIZE(keys);
14067 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014068 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014069 for (i = 0; i < n; i++) {
14070 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014071 if (PyUnicode_READY(s) == -1) {
14072 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014073 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014074 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014075 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014076 case SSTATE_NOT_INTERNED:
14077 /* XXX Shouldn't happen */
14078 break;
14079 case SSTATE_INTERNED_IMMORTAL:
14080 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014081 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014082 break;
14083 case SSTATE_INTERNED_MORTAL:
14084 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014085 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014086 break;
14087 default:
14088 Py_FatalError("Inconsistent interned string state.");
14089 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014090 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014091 }
14092 fprintf(stderr, "total size of all interned strings: "
14093 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14094 "mortal/immortal\n", mortal_size, immortal_size);
14095 Py_DECREF(keys);
14096 PyDict_Clear(interned);
14097 Py_DECREF(interned);
14098 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000014099}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014100
14101
14102/********************* Unicode Iterator **************************/
14103
14104typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014105 PyObject_HEAD
14106 Py_ssize_t it_index;
14107 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014108} unicodeiterobject;
14109
14110static void
14111unicodeiter_dealloc(unicodeiterobject *it)
14112{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014113 _PyObject_GC_UNTRACK(it);
14114 Py_XDECREF(it->it_seq);
14115 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014116}
14117
14118static int
14119unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14120{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014121 Py_VISIT(it->it_seq);
14122 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014123}
14124
14125static PyObject *
14126unicodeiter_next(unicodeiterobject *it)
14127{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014128 PyUnicodeObject *seq;
14129 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014130
Benjamin Peterson14339b62009-01-31 16:36:08 +000014131 assert(it != NULL);
14132 seq = it->it_seq;
14133 if (seq == NULL)
14134 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014135 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014136
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014137 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14138 int kind = PyUnicode_KIND(seq);
14139 void *data = PyUnicode_DATA(seq);
14140 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14141 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014142 if (item != NULL)
14143 ++it->it_index;
14144 return item;
14145 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014146
Benjamin Peterson14339b62009-01-31 16:36:08 +000014147 Py_DECREF(seq);
14148 it->it_seq = NULL;
14149 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014150}
14151
14152static PyObject *
14153unicodeiter_len(unicodeiterobject *it)
14154{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014155 Py_ssize_t len = 0;
14156 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014157 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014158 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014159}
14160
14161PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14162
14163static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014164 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014165 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014166 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014167};
14168
14169PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014170 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14171 "str_iterator", /* tp_name */
14172 sizeof(unicodeiterobject), /* tp_basicsize */
14173 0, /* tp_itemsize */
14174 /* methods */
14175 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14176 0, /* tp_print */
14177 0, /* tp_getattr */
14178 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014179 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014180 0, /* tp_repr */
14181 0, /* tp_as_number */
14182 0, /* tp_as_sequence */
14183 0, /* tp_as_mapping */
14184 0, /* tp_hash */
14185 0, /* tp_call */
14186 0, /* tp_str */
14187 PyObject_GenericGetAttr, /* tp_getattro */
14188 0, /* tp_setattro */
14189 0, /* tp_as_buffer */
14190 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14191 0, /* tp_doc */
14192 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14193 0, /* tp_clear */
14194 0, /* tp_richcompare */
14195 0, /* tp_weaklistoffset */
14196 PyObject_SelfIter, /* tp_iter */
14197 (iternextfunc)unicodeiter_next, /* tp_iternext */
14198 unicodeiter_methods, /* tp_methods */
14199 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014200};
14201
14202static PyObject *
14203unicode_iter(PyObject *seq)
14204{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014205 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014206
Benjamin Peterson14339b62009-01-31 16:36:08 +000014207 if (!PyUnicode_Check(seq)) {
14208 PyErr_BadInternalCall();
14209 return NULL;
14210 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014211 if (PyUnicode_READY(seq) == -1)
14212 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014213 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14214 if (it == NULL)
14215 return NULL;
14216 it->it_index = 0;
14217 Py_INCREF(seq);
14218 it->it_seq = (PyUnicodeObject *)seq;
14219 _PyObject_GC_TRACK(it);
14220 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014221}
14222
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014223#define UNIOP(x) Py_UNICODE_##x
14224#define UNIOP_t Py_UNICODE
14225#include "uniops.h"
14226#undef UNIOP
14227#undef UNIOP_t
14228#define UNIOP(x) Py_UCS4_##x
14229#define UNIOP_t Py_UCS4
14230#include "uniops.h"
14231#undef UNIOP
14232#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000014233
Victor Stinner71133ff2010-09-01 23:43:53 +000014234Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000014235PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000014236{
14237 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
Victor Stinner577db2c2011-10-11 22:12:48 +020014238 Py_UNICODE *u, *copy;
Victor Stinner71133ff2010-09-01 23:43:53 +000014239 Py_ssize_t size;
14240
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014241 if (!PyUnicode_Check(unicode)) {
14242 PyErr_BadArgument();
14243 return NULL;
14244 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014245 u = PyUnicode_AsUnicode(object);
14246 if (u == NULL)
14247 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014248 /* Ensure we won't overflow the size. */
14249 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
14250 PyErr_NoMemory();
14251 return NULL;
14252 }
14253 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
14254 size *= sizeof(Py_UNICODE);
14255 copy = PyMem_Malloc(size);
14256 if (copy == NULL) {
14257 PyErr_NoMemory();
14258 return NULL;
14259 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014260 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014261 return copy;
14262}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014263
Georg Brandl66c221e2010-10-14 07:04:07 +000014264/* A _string module, to export formatter_parser and formatter_field_name_split
14265 to the string.Formatter class implemented in Python. */
14266
14267static PyMethodDef _string_methods[] = {
14268 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14269 METH_O, PyDoc_STR("split the argument as a field name")},
14270 {"formatter_parser", (PyCFunction) formatter_parser,
14271 METH_O, PyDoc_STR("parse the argument as a format string")},
14272 {NULL, NULL}
14273};
14274
14275static struct PyModuleDef _string_module = {
14276 PyModuleDef_HEAD_INIT,
14277 "_string",
14278 PyDoc_STR("string helper module"),
14279 0,
14280 _string_methods,
14281 NULL,
14282 NULL,
14283 NULL,
14284 NULL
14285};
14286
14287PyMODINIT_FUNC
14288PyInit__string(void)
14289{
14290 return PyModule_Create(&_string_module);
14291}
14292
14293
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014294#ifdef __cplusplus
14295}
14296#endif