blob: ec4aed6e2df733d58630ccf1b7a201610c0d7930 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
Victor Stinnerce5faf62011-10-05 00:42:43 +020049#ifdef Py_DEBUG
50# define DONT_MAKE_RESULT_READY
51#endif
52
Guido van Rossumd57fd912000-03-10 22:53:23 +000053/* Limit for the Unicode object free list */
54
Christian Heimes2202f872008-02-06 14:31:34 +000055#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000056
57/* Limit for the Unicode object free list stay alive optimization.
58
59 The implementation will keep allocated Unicode memory intact for
60 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000061 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000062
Christian Heimes2202f872008-02-06 14:31:34 +000063 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000065 malloc()-overhead) bytes of unused garbage.
66
67 Setting the limit to 0 effectively turns the feature off.
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069 Note: This is an experimental feature ! If you get core dumps when
70 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000071
72*/
73
Guido van Rossumfd4b9572000-04-10 13:51:10 +000074#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000075
76/* Endianness switches; defaults to little endian */
77
78#ifdef WORDS_BIGENDIAN
79# define BYTEORDER_IS_BIG_ENDIAN
80#else
81# define BYTEORDER_IS_LITTLE_ENDIAN
82#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Victor Stinner910337b2011-10-03 03:20:16 +020096#ifdef Py_DEBUG
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020097# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
Victor Stinner910337b2011-10-03 03:20:16 +020098#else
99# define _PyUnicode_CHECK(op) PyUnicode_Check(op)
100#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200101
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200102#define _PyUnicode_UTF8(op) \
103 (((PyCompactUnicodeObject*)(op))->utf8)
104#define PyUnicode_UTF8(op) \
Victor Stinner910337b2011-10-03 03:20:16 +0200105 (assert(_PyUnicode_CHECK(op)), \
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200106 assert(PyUnicode_IS_READY(op)), \
107 PyUnicode_IS_COMPACT_ASCII(op) ? \
108 ((char*)((PyASCIIObject*)(op) + 1)) : \
109 _PyUnicode_UTF8(op))
Victor Stinnerbc8b81b2011-09-29 19:31:34 +0200110#define _PyUnicode_UTF8_LENGTH(op) \
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200111 (((PyCompactUnicodeObject*)(op))->utf8_length)
112#define PyUnicode_UTF8_LENGTH(op) \
Victor Stinner910337b2011-10-03 03:20:16 +0200113 (assert(_PyUnicode_CHECK(op)), \
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200114 assert(PyUnicode_IS_READY(op)), \
115 PyUnicode_IS_COMPACT_ASCII(op) ? \
116 ((PyASCIIObject*)(op))->length : \
117 _PyUnicode_UTF8_LENGTH(op))
Victor Stinnera5f91632011-10-04 01:07:11 +0200118#define _PyUnicode_WSTR(op) \
119 (((PyASCIIObject*)(op))->wstr)
120#define _PyUnicode_WSTR_LENGTH(op) \
121 (((PyCompactUnicodeObject*)(op))->wstr_length)
122#define _PyUnicode_LENGTH(op) \
123 (((PyASCIIObject *)(op))->length)
124#define _PyUnicode_STATE(op) \
125 (((PyASCIIObject *)(op))->state)
126#define _PyUnicode_HASH(op) \
127 (((PyASCIIObject *)(op))->hash)
Victor Stinner910337b2011-10-03 03:20:16 +0200128#define _PyUnicode_KIND(op) \
129 (assert(_PyUnicode_CHECK(op)), \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200130 ((PyASCIIObject *)(op))->state.kind)
Victor Stinner910337b2011-10-03 03:20:16 +0200131#define _PyUnicode_GET_LENGTH(op) \
132 (assert(_PyUnicode_CHECK(op)), \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200133 ((PyASCIIObject *)(op))->length)
Victor Stinnera5f91632011-10-04 01:07:11 +0200134#define _PyUnicode_DATA_ANY(op) \
135 (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200136
Victor Stinner910337b2011-10-03 03:20:16 +0200137#undef PyUnicode_READY
138#define PyUnicode_READY(op) \
139 (assert(_PyUnicode_CHECK(op)), \
140 (PyUnicode_IS_READY(op) ? \
Victor Stinnera5f91632011-10-04 01:07:11 +0200141 0 : \
142 _PyUnicode_Ready((PyObject *)(op))))
Victor Stinner910337b2011-10-03 03:20:16 +0200143
/* Ensure that the Unicode object referenced by *p_obj is ready.
   Evaluates to 0 when the object is already ready; otherwise delegates to
   _PyUnicode_ReadyReplace() and evaluates to its result.
   p_obj is expanded several times, so it must be free of side effects. */
#define _PyUnicode_READY_REPLACE(p_obj)                 \
    (assert(_PyUnicode_CHECK(*(p_obj))),                \
     (PyUnicode_IS_READY(*(p_obj)) ?                    \
      0 : _PyUnicode_ReadyReplace((PyObject **)(p_obj))))
148
Victor Stinnerc379ead2011-10-03 12:52:27 +0200149#define _PyUnicode_SHARE_UTF8(op) \
150 (assert(_PyUnicode_CHECK(op)), \
151 assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
152 (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* true if the wstr pointer of the Unicode object is the same pointer as
   its character data (PyUnicode_DATA), i.e. wstr is not a separate block */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
156
Victor Stinner829c0ad2011-10-03 01:08:02 +0200157/* true if the Unicode object has an allocated UTF-8 memory block
158 (not shared with other data) */
Victor Stinner910337b2011-10-03 03:20:16 +0200159#define _PyUnicode_HAS_UTF8_MEMORY(op) \
160 (assert(_PyUnicode_CHECK(op)), \
161 (!PyUnicode_IS_COMPACT_ASCII(op) \
162 && _PyUnicode_UTF8(op) \
Victor Stinner829c0ad2011-10-03 01:08:02 +0200163 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
164
Victor Stinner03490912011-10-03 23:45:12 +0200165/* true if the Unicode object has an allocated wstr memory block
166 (not shared with other data) */
167#define _PyUnicode_HAS_WSTR_MEMORY(op) \
168 (assert(_PyUnicode_CHECK(op)), \
169 (_PyUnicode_WSTR(op) && \
170 (!PyUnicode_IS_READY(op) || \
171 _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
172
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   The bulk of the input is copied four characters per iteration; the
   remaining 0-3 characters are copied one at a time.  The "to" argument is
   fully parenthesized before being cast, so pointer expressions such as
   "base + offset" are evaluated in their own type first. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t n = (_end) - (_iter);                \
        const from_type *_unrolled_end =                \
            _iter + (n & ~ (Py_ssize_t) 3);             \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200196
Victor Stinnerb15d4d82011-09-28 23:59:20 +0200197/* The Unicode string has been modified: reset the hash */
198#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
199
Walter Dörwald16807132007-05-25 13:52:07 +0000200/* This dictionary holds all interned unicode strings. Note that references
201 to strings in this dictionary are *not* counted in the string's ob_refcnt.
202 When the interned string reaches a refcnt of 0 the string deallocation
203 function will delete the reference from this dictionary.
204
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000207*/
208static PyObject *interned;
209
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000210/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200211static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000212
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200213/* List of static strings. */
214static _Py_Identifier *static_strings;
215
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000216/* Single character Unicode strings in the Latin-1 range are being
217 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200218static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000219
Christian Heimes190d79e2008-01-30 11:58:22 +0000220/* Fast detection of the most frequent whitespace characters */
221const unsigned char _Py_ascii_whitespace[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +0000222 0, 0, 0, 0, 0, 0, 0, 0,
Florent Xicluna806d8cf2010-03-30 19:34:18 +0000223/* case 0x0009: * CHARACTER TABULATION */
Christian Heimes1a8501c2008-10-02 19:56:01 +0000224/* case 0x000A: * LINE FEED */
Florent Xicluna806d8cf2010-03-30 19:34:18 +0000225/* case 0x000B: * LINE TABULATION */
Christian Heimes1a8501c2008-10-02 19:56:01 +0000226/* case 0x000C: * FORM FEED */
227/* case 0x000D: * CARRIAGE RETURN */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000228 0, 1, 1, 1, 1, 1, 0, 0,
229 0, 0, 0, 0, 0, 0, 0, 0,
Christian Heimes1a8501c2008-10-02 19:56:01 +0000230/* case 0x001C: * FILE SEPARATOR */
231/* case 0x001D: * GROUP SEPARATOR */
232/* case 0x001E: * RECORD SEPARATOR */
233/* case 0x001F: * UNIT SEPARATOR */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000234 0, 0, 0, 0, 1, 1, 1, 1,
Christian Heimes1a8501c2008-10-02 19:56:01 +0000235/* case 0x0020: * SPACE */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000236 1, 0, 0, 0, 0, 0, 0, 0,
237 0, 0, 0, 0, 0, 0, 0, 0,
238 0, 0, 0, 0, 0, 0, 0, 0,
239 0, 0, 0, 0, 0, 0, 0, 0,
Christian Heimes190d79e2008-01-30 11:58:22 +0000240
Benjamin Peterson14339b62009-01-31 16:36:08 +0000241 0, 0, 0, 0, 0, 0, 0, 0,
242 0, 0, 0, 0, 0, 0, 0, 0,
243 0, 0, 0, 0, 0, 0, 0, 0,
244 0, 0, 0, 0, 0, 0, 0, 0,
245 0, 0, 0, 0, 0, 0, 0, 0,
246 0, 0, 0, 0, 0, 0, 0, 0,
247 0, 0, 0, 0, 0, 0, 0, 0,
248 0, 0, 0, 0, 0, 0, 0, 0
Christian Heimes190d79e2008-01-30 11:58:22 +0000249};
250
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200251/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200252static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200253static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200254static void copy_characters(
255 PyObject *to, Py_ssize_t to_start,
256 PyObject *from, Py_ssize_t from_start,
257 Py_ssize_t how_many);
Victor Stinnerc729b8e2011-10-06 02:36:59 +0200258#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200259static int unicode_is_singleton(PyObject *unicode);
Victor Stinnerc729b8e2011-10-06 02:36:59 +0200260#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200261
Alexander Belopolsky40018472011-02-26 01:02:56 +0000262static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200263unicode_fromascii(const unsigned char *s, Py_ssize_t size);
264static PyObject *
265_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
266static PyObject *
267_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
268static PyObject *
269_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
270
271static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000272unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000273 PyObject **errorHandler,const char *encoding, const char *reason,
274 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
275 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
276
Alexander Belopolsky40018472011-02-26 01:02:56 +0000277static void
278raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300279 const char *encoding,
280 const Py_UNICODE *unicode, Py_ssize_t size,
281 Py_ssize_t startpos, Py_ssize_t endpos,
282 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000283
Christian Heimes190d79e2008-01-30 11:58:22 +0000284/* Same for linebreaks */
285static unsigned char ascii_linebreak[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +0000286 0, 0, 0, 0, 0, 0, 0, 0,
Christian Heimes1a8501c2008-10-02 19:56:01 +0000287/* 0x000A, * LINE FEED */
Florent Xicluna806d8cf2010-03-30 19:34:18 +0000288/* 0x000B, * LINE TABULATION */
289/* 0x000C, * FORM FEED */
Christian Heimes1a8501c2008-10-02 19:56:01 +0000290/* 0x000D, * CARRIAGE RETURN */
Florent Xicluna806d8cf2010-03-30 19:34:18 +0000291 0, 0, 1, 1, 1, 1, 0, 0,
Benjamin Peterson14339b62009-01-31 16:36:08 +0000292 0, 0, 0, 0, 0, 0, 0, 0,
Christian Heimes1a8501c2008-10-02 19:56:01 +0000293/* 0x001C, * FILE SEPARATOR */
294/* 0x001D, * GROUP SEPARATOR */
295/* 0x001E, * RECORD SEPARATOR */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000296 0, 0, 0, 0, 1, 1, 1, 0,
297 0, 0, 0, 0, 0, 0, 0, 0,
298 0, 0, 0, 0, 0, 0, 0, 0,
299 0, 0, 0, 0, 0, 0, 0, 0,
300 0, 0, 0, 0, 0, 0, 0, 0,
Christian Heimes190d79e2008-01-30 11:58:22 +0000301
Benjamin Peterson14339b62009-01-31 16:36:08 +0000302 0, 0, 0, 0, 0, 0, 0, 0,
303 0, 0, 0, 0, 0, 0, 0, 0,
304 0, 0, 0, 0, 0, 0, 0, 0,
305 0, 0, 0, 0, 0, 0, 0, 0,
306 0, 0, 0, 0, 0, 0, 0, 0,
307 0, 0, 0, 0, 0, 0, 0, 0,
308 0, 0, 0, 0, 0, 0, 0, 0,
309 0, 0, 0, 0, 0, 0, 0, 0
Christian Heimes190d79e2008-01-30 11:58:22 +0000310};
311
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300312/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
313 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000314Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000315PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000316{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000317#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000318 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000319#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000320 /* This is actually an illegal character, so it should
321 not be passed to unichr. */
322 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000323#endif
324}
325
#ifdef Py_DEBUG
/* Debug-only check of the internal invariants of a Unicode object.

   op            - the object to inspect (must satisfy PyUnicode_Check).
   check_content - when non-zero, additionally scan the characters to verify
                   that the narrowest suitable kind is used, and verify that
                   non-singleton strings have no cached hash.

   Every violated invariant trips an assert(); the function itself always
   returns 1 so that callers can wrap the call in assert().

   FIXME: use PyObject* type for op */
int
_PyUnicode_CheckConsistency(void *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: the characters directly follow the
               PyCompactUnicodeObject header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready yet: only the wstr buffer exists */
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
            }
            else {
                /* non-compact string that has been made ready */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* kind whose character width equals sizeof(wchar_t) */
#if SIZEOF_WCHAR_T == 2
            const unsigned int wchar_kind = PyUnicode_2BYTE_KIND;
#else
            const unsigned int wchar_kind = PyUnicode_4BYTE_KIND;
#endif
            if (kind == wchar_kind) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else
                assert(ascii->wstr != data);
        }

        /* an absent buffer must have a zero recorded length */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t i = 0;
        Py_UCS4 maxchar = 0;
        void *data = PyUnicode_DATA(ascii);

        while (i < ascii->length) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
            i++;
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0)
                assert(maxchar >= 128);
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND)
            assert(maxchar >= 0x100);
        else
            assert(maxchar >= 0x10000);
    }

    /* with content checking, only singletons may carry a cached hash */
    if (check_content && !unicode_is_singleton((PyObject*)ascii))
        assert(ascii->hash == -1);
    return 1;
}
#endif
431
Victor Stinner3a50e702011-10-18 21:21:00 +0200432#ifdef HAVE_MBCS
433static OSVERSIONINFOEX winver;
434#endif
435
Thomas Wouters477c8d52006-05-27 19:21:47 +0000436/* --- Bloom Filters ----------------------------------------------------- */
437
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low bits of
   each Unicode character (its value modulo BLOOM_WIDTH) as the bit index. */
441
442/* the linebreak mask is set up by Unicode_Init below */
443
Antoine Pitrouf068f942010-01-13 14:19:12 +0000444#if LONG_BIT >= 128
445#define BLOOM_WIDTH 128
446#elif LONG_BIT >= 64
447#define BLOOM_WIDTH 64
448#elif LONG_BIT >= 32
449#define BLOOM_WIDTH 32
450#else
451#error "LONG_BIT is smaller than 32"
452#endif
453
Thomas Wouters477c8d52006-05-27 19:21:47 +0000454#define BLOOM_MASK unsigned long
455
456static BLOOM_MASK bloom_linebreak;
457
/* BLOOM_ADD sets, and BLOOM tests, the bit of mask selected by
   ch modulo BLOOM_WIDTH.  Both arguments are parenthesized so that
   arbitrary expressions can be passed safely. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) &  (1UL << ((ch) & (BLOOM_WIDTH - 1))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000460
Benjamin Peterson29060642009-01-31 22:14:21 +0000461#define BLOOM_LINEBREAK(ch) \
462 ((ch) < 128U ? ascii_linebreak[(ch)] : \
463 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000464
Alexander Belopolsky40018472011-02-26 01:02:56 +0000465Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200466make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000467{
468 /* calculate simple bloom-style bitmask for a given unicode string */
469
Antoine Pitrouf068f942010-01-13 14:19:12 +0000470 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000471 Py_ssize_t i;
472
473 mask = 0;
474 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200475 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000476
477 return mask;
478}
479
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200480#define BLOOM_MEMBER(mask, chr, str) \
481 (BLOOM(mask, chr) \
482 && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000483
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200484/* Compilation of templated routines */
485
486#include "stringlib/asciilib.h"
487#include "stringlib/fastsearch.h"
488#include "stringlib/partition.h"
489#include "stringlib/split.h"
490#include "stringlib/count.h"
491#include "stringlib/find.h"
492#include "stringlib/find_max_char.h"
493#include "stringlib/localeutil.h"
494#include "stringlib/undef.h"
495
496#include "stringlib/ucs1lib.h"
497#include "stringlib/fastsearch.h"
498#include "stringlib/partition.h"
499#include "stringlib/split.h"
500#include "stringlib/count.h"
501#include "stringlib/find.h"
502#include "stringlib/find_max_char.h"
503#include "stringlib/localeutil.h"
504#include "stringlib/undef.h"
505
506#include "stringlib/ucs2lib.h"
507#include "stringlib/fastsearch.h"
508#include "stringlib/partition.h"
509#include "stringlib/split.h"
510#include "stringlib/count.h"
511#include "stringlib/find.h"
512#include "stringlib/find_max_char.h"
513#include "stringlib/localeutil.h"
514#include "stringlib/undef.h"
515
516#include "stringlib/ucs4lib.h"
517#include "stringlib/fastsearch.h"
518#include "stringlib/partition.h"
519#include "stringlib/split.h"
520#include "stringlib/count.h"
521#include "stringlib/find.h"
522#include "stringlib/find_max_char.h"
523#include "stringlib/localeutil.h"
524#include "stringlib/undef.h"
525
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200526#include "stringlib/unicodedefs.h"
527#include "stringlib/fastsearch.h"
528#include "stringlib/count.h"
529#include "stringlib/find.h"
530
Guido van Rossumd57fd912000-03-10 22:53:23 +0000531/* --- Unicode Object ----------------------------------------------------- */
532
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200533static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200534fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200535
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200536Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
537 Py_ssize_t size, Py_UCS4 ch,
538 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200539{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200540 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
541
542 switch (kind) {
543 case PyUnicode_1BYTE_KIND:
544 {
545 Py_UCS1 ch1 = (Py_UCS1) ch;
546 if (ch1 == ch)
547 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
548 else
549 return -1;
550 }
551 case PyUnicode_2BYTE_KIND:
552 {
553 Py_UCS2 ch2 = (Py_UCS2) ch;
554 if (ch2 == ch)
555 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
556 else
557 return -1;
558 }
559 case PyUnicode_4BYTE_KIND:
560 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
561 default:
562 assert(0);
563 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200564 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200565}
566
Victor Stinnerfe226c02011-10-03 03:52:20 +0200567static PyObject*
568resize_compact(PyObject *unicode, Py_ssize_t length)
569{
570 Py_ssize_t char_size;
571 Py_ssize_t struct_size;
572 Py_ssize_t new_size;
573 int share_wstr;
574
575 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200576 char_size = PyUnicode_KIND(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200577 if (PyUnicode_IS_COMPACT_ASCII(unicode))
578 struct_size = sizeof(PyASCIIObject);
579 else
580 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200581 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200582
583 _Py_DEC_REFTOTAL;
584 _Py_ForgetReference(unicode);
585
586 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
587 PyErr_NoMemory();
588 return NULL;
589 }
590 new_size = (struct_size + (length + 1) * char_size);
591
592 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
593 if (unicode == NULL) {
594 PyObject_Del(unicode);
595 PyErr_NoMemory();
596 return NULL;
597 }
598 _Py_NewReference(unicode);
599 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200600 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200601 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200602 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
603 _PyUnicode_WSTR_LENGTH(unicode) = length;
604 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200605 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
606 length, 0);
607 return unicode;
608}
609
Alexander Belopolsky40018472011-02-26 01:02:56 +0000610static int
Victor Stinner95663112011-10-04 01:03:50 +0200611resize_inplace(PyUnicodeObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000612{
Victor Stinner95663112011-10-04 01:03:50 +0200613 wchar_t *wstr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200614 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200615 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000616
Victor Stinner95663112011-10-04 01:03:50 +0200617 _PyUnicode_DIRTY(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200618
619 if (PyUnicode_IS_READY(unicode)) {
620 Py_ssize_t char_size;
621 Py_ssize_t new_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200622 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200623 void *data;
624
625 data = _PyUnicode_DATA_ANY(unicode);
626 assert(data != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200627 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200628 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
629 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinner95663112011-10-04 01:03:50 +0200630 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
631 {
632 PyObject_DEL(_PyUnicode_UTF8(unicode));
633 _PyUnicode_UTF8(unicode) = NULL;
634 _PyUnicode_UTF8_LENGTH(unicode) = 0;
635 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200636
637 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
638 PyErr_NoMemory();
639 return -1;
640 }
641 new_size = (length + 1) * char_size;
642
643 data = (PyObject *)PyObject_REALLOC(data, new_size);
644 if (data == NULL) {
645 PyErr_NoMemory();
646 return -1;
647 }
648 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200649 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200650 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200651 _PyUnicode_WSTR_LENGTH(unicode) = length;
652 }
653 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200654 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200655 _PyUnicode_UTF8_LENGTH(unicode) = length;
656 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200657 _PyUnicode_LENGTH(unicode) = length;
658 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200659 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200660 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200661 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200662 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200663 }
Victor Stinner95663112011-10-04 01:03:50 +0200664 assert(_PyUnicode_WSTR(unicode) != NULL);
665
666 /* check for integer overflow */
667 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
668 PyErr_NoMemory();
669 return -1;
670 }
671 wstr = _PyUnicode_WSTR(unicode);
672 wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
673 if (!wstr) {
674 PyErr_NoMemory();
675 return -1;
676 }
677 _PyUnicode_WSTR(unicode) = wstr;
678 _PyUnicode_WSTR(unicode)[length] = 0;
679 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200680 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000681 return 0;
682}
683
Victor Stinnerfe226c02011-10-03 03:52:20 +0200684static PyObject*
685resize_copy(PyObject *unicode, Py_ssize_t length)
686{
687 Py_ssize_t copy_length;
688 if (PyUnicode_IS_COMPACT(unicode)) {
689 PyObject *copy;
690 assert(PyUnicode_IS_READY(unicode));
691
692 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
693 if (copy == NULL)
694 return NULL;
695
696 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200697 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200698 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200699 }
700 else {
Victor Stinner2fd82272011-10-03 04:06:05 +0200701 PyUnicodeObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200702 assert(_PyUnicode_WSTR(unicode) != NULL);
703 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner2fd82272011-10-03 04:06:05 +0200704 w = _PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200705 if (w == NULL)
706 return NULL;
707 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
708 copy_length = Py_MIN(copy_length, length);
709 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
710 copy_length);
711 return (PyObject*)w;
712 }
713}
714
Guido van Rossumd57fd912000-03-10 22:53:23 +0000715/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000716 Ux0000 terminated; some code (e.g. new_identifier)
717 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000718
719 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000720 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000721
722*/
723
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200724#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200725static int unicode_old_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200726#endif
727
Alexander Belopolsky40018472011-02-26 01:02:56 +0000728static PyUnicodeObject *
729_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000730{
731 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200732 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000733
Thomas Wouters477c8d52006-05-27 19:21:47 +0000734 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000735 if (length == 0 && unicode_empty != NULL) {
736 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200737 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000738 }
739
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000740 /* Ensure we won't overflow the size. */
741 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
742 return (PyUnicodeObject *)PyErr_NoMemory();
743 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200744 if (length < 0) {
745 PyErr_SetString(PyExc_SystemError,
746 "Negative size passed to _PyUnicode_New");
747 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000748 }
749
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200750#ifdef Py_DEBUG
751 ++unicode_old_new_calls;
752#endif
753
754 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
755 if (unicode == NULL)
756 return NULL;
757 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
758 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
759 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000760 PyErr_NoMemory();
761 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000762 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200763
Jeremy Hyltond8082792003-09-16 19:41:39 +0000764 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000765 * the caller fails before initializing str -- unicode_resize()
766 * reads str[0], and the Keep-Alive optimization can keep memory
767 * allocated for str alive across a call to unicode_dealloc(unicode).
768 * We don't want unicode_resize to read uninitialized memory in
769 * that case.
770 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200771 _PyUnicode_WSTR(unicode)[0] = 0;
772 _PyUnicode_WSTR(unicode)[length] = 0;
773 _PyUnicode_WSTR_LENGTH(unicode) = length;
774 _PyUnicode_HASH(unicode) = -1;
775 _PyUnicode_STATE(unicode).interned = 0;
776 _PyUnicode_STATE(unicode).kind = 0;
777 _PyUnicode_STATE(unicode).compact = 0;
778 _PyUnicode_STATE(unicode).ready = 0;
779 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200780 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200781 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200782 _PyUnicode_UTF8(unicode) = NULL;
783 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner67072932011-10-18 22:10:14 +0200784 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000785 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000786
Benjamin Peterson29060642009-01-31 22:14:21 +0000787 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000788 /* XXX UNREF/NEWREF interface should be more symmetrical */
789 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000790 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000791 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000792 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000793}
794
Victor Stinnerf42dc442011-10-02 23:33:16 +0200795static const char*
796unicode_kind_name(PyObject *unicode)
797{
Victor Stinner42dfd712011-10-03 14:41:45 +0200798 /* don't check consistency: unicode_kind_name() is called from
799 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200800 if (!PyUnicode_IS_COMPACT(unicode))
801 {
802 if (!PyUnicode_IS_READY(unicode))
803 return "wstr";
804 switch(PyUnicode_KIND(unicode))
805 {
806 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200807 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200808 return "legacy ascii";
809 else
810 return "legacy latin1";
811 case PyUnicode_2BYTE_KIND:
812 return "legacy UCS2";
813 case PyUnicode_4BYTE_KIND:
814 return "legacy UCS4";
815 default:
816 return "<legacy invalid kind>";
817 }
818 }
819 assert(PyUnicode_IS_READY(unicode));
820 switch(PyUnicode_KIND(unicode))
821 {
822 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200823 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200824 return "ascii";
825 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200826 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200827 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200828 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200829 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200830 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200831 default:
832 return "<invalid compact kind>";
833 }
834}
835
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200836#ifdef Py_DEBUG
/* Incremented by PyUnicode_New() on every call that gets past its
   empty-string shortcut (debug builds only). */
static int unicode_new_new_calls = 0;

/* Functions wrapping macros for use in debugger */

/* Return the value of the PyUnicode_UTF8() macro for `unicode`. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
843
/* Debugger helper: return the value of the _PyUnicode_COMPACT_DATA() macro
   for `unicode`. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
847void *_PyUnicode_data(void *unicode){
848 printf("obj %p\n", unicode);
849 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
850 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
851 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
852 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
853 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
854 return PyUnicode_DATA(unicode);
855}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200856
857void
858_PyUnicode_Dump(PyObject *op)
859{
860 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200861 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
862 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
863 void *data;
864 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
865 if (ascii->state.compact)
866 data = (compact + 1);
867 else
868 data = unicode->data.any;
869 if (ascii->wstr == data)
870 printf("shared ");
871 printf("wstr=%p", ascii->wstr);
Victor Stinnera3b334d2011-10-03 13:53:37 +0200872 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200873 printf(" (%zu), ", compact->wstr_length);
874 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
875 printf("shared ");
876 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200877 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200878 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200879}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200880#endif
881
882PyObject *
883PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
884{
885 PyObject *obj;
886 PyCompactUnicodeObject *unicode;
887 void *data;
888 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200889 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200890 Py_ssize_t char_size;
891 Py_ssize_t struct_size;
892
893 /* Optimization for empty strings */
894 if (size == 0 && unicode_empty != NULL) {
895 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200896 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200897 }
898
899#ifdef Py_DEBUG
900 ++unicode_new_new_calls;
901#endif
902
Victor Stinner9e9d6892011-10-04 01:02:02 +0200903 is_ascii = 0;
904 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200905 struct_size = sizeof(PyCompactUnicodeObject);
906 if (maxchar < 128) {
907 kind_state = PyUnicode_1BYTE_KIND;
908 char_size = 1;
909 is_ascii = 1;
910 struct_size = sizeof(PyASCIIObject);
911 }
912 else if (maxchar < 256) {
913 kind_state = PyUnicode_1BYTE_KIND;
914 char_size = 1;
915 }
916 else if (maxchar < 65536) {
917 kind_state = PyUnicode_2BYTE_KIND;
918 char_size = 2;
919 if (sizeof(wchar_t) == 2)
920 is_sharing = 1;
921 }
922 else {
923 kind_state = PyUnicode_4BYTE_KIND;
924 char_size = 4;
925 if (sizeof(wchar_t) == 4)
926 is_sharing = 1;
927 }
928
929 /* Ensure we won't overflow the size. */
930 if (size < 0) {
931 PyErr_SetString(PyExc_SystemError,
932 "Negative size passed to PyUnicode_New");
933 return NULL;
934 }
935 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
936 return PyErr_NoMemory();
937
938 /* Duplicated allocation code from _PyObject_New() instead of a call to
939 * PyObject_New() so we are able to allocate space for the object and
940 * it's data buffer.
941 */
942 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
943 if (obj == NULL)
944 return PyErr_NoMemory();
945 obj = PyObject_INIT(obj, &PyUnicode_Type);
946 if (obj == NULL)
947 return NULL;
948
949 unicode = (PyCompactUnicodeObject *)obj;
950 if (is_ascii)
951 data = ((PyASCIIObject*)obj) + 1;
952 else
953 data = unicode + 1;
954 _PyUnicode_LENGTH(unicode) = size;
955 _PyUnicode_HASH(unicode) = -1;
956 _PyUnicode_STATE(unicode).interned = 0;
957 _PyUnicode_STATE(unicode).kind = kind_state;
958 _PyUnicode_STATE(unicode).compact = 1;
959 _PyUnicode_STATE(unicode).ready = 1;
960 _PyUnicode_STATE(unicode).ascii = is_ascii;
961 if (is_ascii) {
962 ((char*)data)[size] = 0;
963 _PyUnicode_WSTR(unicode) = NULL;
964 }
965 else if (kind_state == PyUnicode_1BYTE_KIND) {
966 ((char*)data)[size] = 0;
967 _PyUnicode_WSTR(unicode) = NULL;
968 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200969 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200970 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200971 }
972 else {
973 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200974 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200975 if (kind_state == PyUnicode_2BYTE_KIND)
976 ((Py_UCS2*)data)[size] = 0;
977 else /* kind_state == PyUnicode_4BYTE_KIND */
978 ((Py_UCS4*)data)[size] = 0;
979 if (is_sharing) {
980 _PyUnicode_WSTR_LENGTH(unicode) = size;
981 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
982 }
983 else {
984 _PyUnicode_WSTR_LENGTH(unicode) = 0;
985 _PyUnicode_WSTR(unicode) = NULL;
986 }
987 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200988 assert(_PyUnicode_CheckConsistency(unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200989 return obj;
990}
991
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
   will decode surrogate pairs, the other conversions are implemented as macros
   for efficiency.

   A high surrogate directly followed by a low surrogate is combined into one
   code point; any other unit, including a lone surrogate, is copied as is.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        /* never write past the length recorded in the object */
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (src[0] >= 0xD800 && src[0] <= 0xDBFF
            && (src + 1) < end && src[1] >= 0xDC00 && src[1] <= 0xDFFF)
        {
            *dst++ = (((src[0] & 0x3FF)<<10) | (src[1] & 0x3FF)) + 0x10000;
            src += 2;
        }
        else {
            *dst++ = *src++;
        }
    }
    /* the output must have been filled exactly */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1030
Victor Stinnercd9950f2011-10-02 00:34:53 +02001031static int
1032_PyUnicode_Dirty(PyObject *unicode)
1033{
Victor Stinner910337b2011-10-03 03:20:16 +02001034 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +02001035 if (Py_REFCNT(unicode) != 1) {
Victor Stinner01698042011-10-04 00:04:26 +02001036 PyErr_SetString(PyExc_SystemError,
Victor Stinnercd9950f2011-10-02 00:34:53 +02001037 "Cannot modify a string having more than 1 reference");
1038 return -1;
1039 }
1040 _PyUnicode_DIRTY(unicode);
1041 return 0;
1042}
1043
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001044static int
1045_copy_characters(PyObject *to, Py_ssize_t to_start,
1046 PyObject *from, Py_ssize_t from_start,
1047 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001048{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001049 unsigned int from_kind, to_kind;
1050 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001051 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001052
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001053 assert(PyUnicode_Check(from));
1054 assert(PyUnicode_Check(to));
1055 assert(PyUnicode_IS_READY(from));
1056 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001057
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001058 assert(PyUnicode_GET_LENGTH(from) >= how_many);
1059 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1060 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001061
Victor Stinnerf5ca1a22011-09-28 23:54:59 +02001062 if (how_many == 0)
1063 return 0;
1064
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001065 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001066 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001067 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001068 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001069
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001070#ifdef Py_DEBUG
1071 if (!check_maxchar
1072 && (from_kind > to_kind
1073 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001074 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001075 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1076 Py_UCS4 ch;
1077 Py_ssize_t i;
1078 for (i=0; i < how_many; i++) {
1079 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1080 assert(ch <= to_maxchar);
1081 }
1082 }
1083#endif
1084 fast = (from_kind == to_kind);
1085 if (check_maxchar
1086 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1087 {
1088 /* deny latin1 => ascii */
1089 fast = 0;
1090 }
1091
1092 if (fast) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001093 Py_MEMCPY((char*)to_data + to_kind * to_start,
1094 (char*)from_data + from_kind * from_start,
1095 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001096 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001097 else if (from_kind == PyUnicode_1BYTE_KIND
1098 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001099 {
1100 _PyUnicode_CONVERT_BYTES(
1101 Py_UCS1, Py_UCS2,
1102 PyUnicode_1BYTE_DATA(from) + from_start,
1103 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1104 PyUnicode_2BYTE_DATA(to) + to_start
1105 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001106 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001107 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001108 && to_kind == PyUnicode_4BYTE_KIND)
1109 {
1110 _PyUnicode_CONVERT_BYTES(
1111 Py_UCS1, Py_UCS4,
1112 PyUnicode_1BYTE_DATA(from) + from_start,
1113 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1114 PyUnicode_4BYTE_DATA(to) + to_start
1115 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001116 }
1117 else if (from_kind == PyUnicode_2BYTE_KIND
1118 && to_kind == PyUnicode_4BYTE_KIND)
1119 {
1120 _PyUnicode_CONVERT_BYTES(
1121 Py_UCS2, Py_UCS4,
1122 PyUnicode_2BYTE_DATA(from) + from_start,
1123 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1124 PyUnicode_4BYTE_DATA(to) + to_start
1125 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001126 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001127 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001128 /* check if max_char(from substring) <= max_char(to) */
1129 if (from_kind > to_kind
1130 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001131 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001132 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001133 /* slow path to check for character overflow */
1134 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001135 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001136 Py_ssize_t i;
1137
Victor Stinner56c161a2011-10-06 02:47:11 +02001138#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001139 for (i=0; i < how_many; i++) {
1140 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001141 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001142 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1143 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001144#else
1145 if (!check_maxchar) {
1146 for (i=0; i < how_many; i++) {
1147 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1148 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1149 }
1150 }
1151 else {
1152 for (i=0; i < how_many; i++) {
1153 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1154 if (ch > to_maxchar)
1155 return 1;
1156 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1157 }
1158 }
1159#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001160 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001161 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001162 assert(0 && "inconsistent state");
1163 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001164 }
1165 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001166 return 0;
1167}
1168
1169static void
1170copy_characters(PyObject *to, Py_ssize_t to_start,
1171 PyObject *from, Py_ssize_t from_start,
1172 Py_ssize_t how_many)
1173{
1174 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1175}
1176
1177Py_ssize_t
1178PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1179 PyObject *from, Py_ssize_t from_start,
1180 Py_ssize_t how_many)
1181{
1182 int err;
1183
1184 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1185 PyErr_BadInternalCall();
1186 return -1;
1187 }
1188
1189 if (PyUnicode_READY(from))
1190 return -1;
1191 if (PyUnicode_READY(to))
1192 return -1;
1193
1194 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1195 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1196 PyErr_Format(PyExc_SystemError,
1197 "Cannot write %zi characters at %zi "
1198 "in a string of %zi characters",
1199 how_many, to_start, PyUnicode_GET_LENGTH(to));
1200 return -1;
1201 }
1202
1203 if (how_many == 0)
1204 return 0;
1205
1206 if (_PyUnicode_Dirty(to))
1207 return -1;
1208
1209 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1210 if (err) {
1211 PyErr_Format(PyExc_SystemError,
1212 "Cannot copy %s characters "
1213 "into a string of %s characters",
1214 unicode_kind_name(from),
1215 unicode_kind_name(to));
1216 return -1;
1217 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001218 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001219}
1220
Victor Stinner17222162011-09-28 22:15:37 +02001221/* Find the maximum code point and count the number of surrogate pairs so a
1222 correct string length can be computed before converting a string to UCS4.
1223 This function counts single surrogates as a character and not as a pair.
1224
1225 Return 0 on success, or -1 on error. */
1226static int
1227find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1228 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001229{
1230 const wchar_t *iter;
1231
Victor Stinnerc53be962011-10-02 21:33:54 +02001232 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001233 *num_surrogates = 0;
1234 *maxchar = 0;
1235
1236 for (iter = begin; iter < end; ) {
Victor Stinnerae864852011-10-05 14:02:44 +02001237 if (*iter > *maxchar) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001238 *maxchar = *iter;
Victor Stinnerae864852011-10-05 14:02:44 +02001239#if SIZEOF_WCHAR_T != 2
1240 if (*maxchar >= 0x10000)
1241 return 0;
1242#endif
1243 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001244#if SIZEOF_WCHAR_T == 2
1245 if (*iter >= 0xD800 && *iter <= 0xDBFF
1246 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1247 {
1248 Py_UCS4 surrogate_val;
1249 surrogate_val = (((iter[0] & 0x3FF)<<10)
1250 | (iter[1] & 0x3FF)) + 0x10000;
1251 ++(*num_surrogates);
1252 if (surrogate_val > *maxchar)
1253 *maxchar = surrogate_val;
1254 iter += 2;
1255 }
1256 else
1257 iter++;
1258#else
1259 iter++;
1260#endif
1261 }
1262 return 0;
1263}
1264
1265#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001266static int unicode_ready_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001267#endif
1268
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001269static int
1270unicode_ready(PyObject **p_obj, int replace)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001271{
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001272 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001273 wchar_t *end;
1274 Py_UCS4 maxchar = 0;
1275 Py_ssize_t num_surrogates;
1276#if SIZEOF_WCHAR_T == 2
1277 Py_ssize_t length_wo_surrogates;
1278#endif
1279
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001280 assert(p_obj != NULL);
1281 unicode = (PyUnicodeObject *)*p_obj;
1282
Georg Brandl7597add2011-10-05 16:36:47 +02001283 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001284 strings were created using _PyObject_New() and where no canonical
1285 representation (the str field) has been set yet aka strings
1286 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001287 assert(_PyUnicode_CHECK(unicode));
1288 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001289 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001290 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001291 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001292 /* Actually, it should neither be interned nor be anything else: */
1293 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001294
1295#ifdef Py_DEBUG
1296 ++unicode_ready_calls;
1297#endif
1298
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001299#ifdef Py_DEBUG
1300 assert(!replace || Py_REFCNT(unicode) == 1);
1301#else
1302 if (replace && Py_REFCNT(unicode) != 1)
1303 replace = 0;
1304#endif
1305 if (replace) {
1306 Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode);
1307 wchar_t *wstr = _PyUnicode_WSTR(unicode);
1308 /* Optimization for empty strings */
1309 if (len == 0) {
1310 Py_INCREF(unicode_empty);
1311 Py_DECREF(*p_obj);
1312 *p_obj = unicode_empty;
1313 return 0;
1314 }
1315 if (len == 1 && wstr[0] < 256) {
1316 PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]);
1317 if (latin1_char == NULL)
1318 return -1;
1319 Py_DECREF(*p_obj);
1320 *p_obj = latin1_char;
1321 return 0;
1322 }
1323 }
1324
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001325 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001326 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001327 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001328 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001329
1330 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001331 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1332 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001333 PyErr_NoMemory();
1334 return -1;
1335 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001336 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001337 _PyUnicode_WSTR(unicode), end,
1338 PyUnicode_1BYTE_DATA(unicode));
1339 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1340 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1341 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1342 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001343 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001344 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001345 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001346 }
1347 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001348 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001349 _PyUnicode_UTF8(unicode) = NULL;
1350 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001351 }
1352 PyObject_FREE(_PyUnicode_WSTR(unicode));
1353 _PyUnicode_WSTR(unicode) = NULL;
1354 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1355 }
1356 /* In this case we might have to convert down from 4-byte native
1357 wchar_t to 2-byte unicode. */
1358 else if (maxchar < 65536) {
1359 assert(num_surrogates == 0 &&
1360 "FindMaxCharAndNumSurrogatePairs() messed up");
1361
Victor Stinner506f5922011-09-28 22:34:18 +02001362#if SIZEOF_WCHAR_T == 2
1363 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001364 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001365 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1366 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1367 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001368 _PyUnicode_UTF8(unicode) = NULL;
1369 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001370#else
1371 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001372 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001373 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001374 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001375 PyErr_NoMemory();
1376 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001377 }
Victor Stinner506f5922011-09-28 22:34:18 +02001378 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1379 _PyUnicode_WSTR(unicode), end,
1380 PyUnicode_2BYTE_DATA(unicode));
1381 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1382 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1383 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001384 _PyUnicode_UTF8(unicode) = NULL;
1385 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001386 PyObject_FREE(_PyUnicode_WSTR(unicode));
1387 _PyUnicode_WSTR(unicode) = NULL;
1388 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1389#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001390 }
1391 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1392 else {
1393#if SIZEOF_WCHAR_T == 2
1394 /* in case the native representation is 2-bytes, we need to allocate a
1395 new normalized 4-byte version. */
1396 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001397 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1398 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001399 PyErr_NoMemory();
1400 return -1;
1401 }
1402 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1403 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001404 _PyUnicode_UTF8(unicode) = NULL;
1405 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001406 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1407 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001408 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001409 PyObject_FREE(_PyUnicode_WSTR(unicode));
1410 _PyUnicode_WSTR(unicode) = NULL;
1411 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1412#else
1413 assert(num_surrogates == 0);
1414
Victor Stinnerc3c74152011-10-02 20:39:55 +02001415 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001416 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001417 _PyUnicode_UTF8(unicode) = NULL;
1418 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001419 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1420#endif
1421 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1422 }
1423 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001424 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001425 return 0;
1426}
1427
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001428int
1429_PyUnicode_ReadyReplace(PyObject **op)
1430{
1431 return unicode_ready(op, 1);
1432}
1433
1434int
1435_PyUnicode_Ready(PyObject *op)
1436{
1437 return unicode_ready(&op, 0);
1438}
1439
Alexander Belopolsky40018472011-02-26 01:02:56 +00001440static void
1441unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001442{
Walter Dörwald16807132007-05-25 13:52:07 +00001443 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001444 case SSTATE_NOT_INTERNED:
1445 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001446
Benjamin Peterson29060642009-01-31 22:14:21 +00001447 case SSTATE_INTERNED_MORTAL:
1448 /* revive dead object temporarily for DelItem */
1449 Py_REFCNT(unicode) = 3;
1450 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
1451 Py_FatalError(
1452 "deletion of interned string failed");
1453 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001454
Benjamin Peterson29060642009-01-31 22:14:21 +00001455 case SSTATE_INTERNED_IMMORTAL:
1456 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001457
Benjamin Peterson29060642009-01-31 22:14:21 +00001458 default:
1459 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001460 }
1461
Victor Stinner03490912011-10-03 23:45:12 +02001462 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001463 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001464 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001465 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001466
1467 if (PyUnicode_IS_COMPACT(unicode)) {
1468 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001469 }
1470 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001471 if (_PyUnicode_DATA_ANY(unicode))
1472 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Benjamin Peterson29060642009-01-31 22:14:21 +00001473 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001474 }
1475}
1476
#ifdef Py_DEBUG
/* Debug helper: return 1 if unicode is the shared empty string or one of
   the cached single-character strings in unicode_latin1[], 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *header = (PyASCIIObject *)unicode;
    Py_UCS4 first;

    if (unicode == unicode_empty)
        return 1;

    /* Only a one-character string in a non-wchar kind can be a cached
       Latin-1 character. */
    if (header->state.kind == PyUnicode_WCHAR_KIND || header->length != 1)
        return 0;

    first = PyUnicode_READ_CHAR(unicode, 0);
    if (first >= 256)
        return 0;
    return unicode_latin1[first] == unicode ? 1 : 0;
}
#endif
1493
Alexander Belopolsky40018472011-02-26 01:02:56 +00001494static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001495unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001496{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001497 if (Py_REFCNT(unicode) != 1)
1498 return 0;
1499 if (PyUnicode_CHECK_INTERNED(unicode))
1500 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001501#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001502 /* singleton refcount is greater than 1 */
1503 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001504#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001505 return 1;
1506}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001507
Victor Stinnerfe226c02011-10-03 03:52:20 +02001508static int
1509unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1510{
1511 PyObject *unicode;
1512 Py_ssize_t old_length;
1513
1514 assert(p_unicode != NULL);
1515 unicode = *p_unicode;
1516
1517 assert(unicode != NULL);
1518 assert(PyUnicode_Check(unicode));
1519 assert(0 <= length);
1520
Victor Stinner910337b2011-10-03 03:20:16 +02001521 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001522 old_length = PyUnicode_WSTR_LENGTH(unicode);
1523 else
1524 old_length = PyUnicode_GET_LENGTH(unicode);
1525 if (old_length == length)
1526 return 0;
1527
Victor Stinnerfe226c02011-10-03 03:52:20 +02001528 if (!unicode_resizable(unicode)) {
1529 PyObject *copy = resize_copy(unicode, length);
1530 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001531 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001532 Py_DECREF(*p_unicode);
1533 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001534 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001535 }
1536
Victor Stinnerfe226c02011-10-03 03:52:20 +02001537 if (PyUnicode_IS_COMPACT(unicode)) {
1538 *p_unicode = resize_compact(unicode, length);
1539 if (*p_unicode == NULL)
1540 return -1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001541 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001542 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001543 }
1544 return resize_inplace((PyUnicodeObject*)unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001545}
1546
Alexander Belopolsky40018472011-02-26 01:02:56 +00001547int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001548PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001549{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001550 PyObject *unicode;
1551 if (p_unicode == NULL) {
1552 PyErr_BadInternalCall();
1553 return -1;
1554 }
1555 unicode = *p_unicode;
1556 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0
1557 || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND)
1558 {
1559 PyErr_BadInternalCall();
1560 return -1;
1561 }
1562 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001563}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001564
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001565static PyObject*
1566get_latin1_char(unsigned char ch)
1567{
Victor Stinnera464fc12011-10-02 20:39:30 +02001568 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001569 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001570 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001571 if (!unicode)
1572 return NULL;
1573 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001574 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001575 unicode_latin1[ch] = unicode;
1576 }
1577 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001578 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001579}
1580
Alexander Belopolsky40018472011-02-26 01:02:56 +00001581PyObject *
1582PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001583{
1584 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001585 Py_UCS4 maxchar = 0;
1586 Py_ssize_t num_surrogates;
1587
1588 if (u == NULL)
1589 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001590
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001591 /* If the Unicode data is known at construction time, we can apply
1592 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001593
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001594 /* Optimization for empty strings */
1595 if (size == 0 && unicode_empty != NULL) {
1596 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001597 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001598 }
Tim Petersced69f82003-09-16 20:30:58 +00001599
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001600 /* Single character Unicode objects in the Latin-1 range are
1601 shared when using this constructor */
1602 if (size == 1 && *u < 256)
1603 return get_latin1_char((unsigned char)*u);
1604
1605 /* If not empty and not single character, copy the Unicode data
1606 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001607 if (find_maxchar_surrogates(u, u + size,
1608 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001609 return NULL;
1610
1611 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1612 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001613 if (!unicode)
1614 return NULL;
1615
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001616 switch (PyUnicode_KIND(unicode)) {
1617 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001618 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001619 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1620 break;
1621 case PyUnicode_2BYTE_KIND:
1622#if Py_UNICODE_SIZE == 2
1623 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1624#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001625 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001626 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1627#endif
1628 break;
1629 case PyUnicode_4BYTE_KIND:
1630#if SIZEOF_WCHAR_T == 2
1631 /* This is the only case which has to process surrogates, thus
1632 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001633 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001634#else
1635 assert(num_surrogates == 0);
1636 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1637#endif
1638 break;
1639 default:
1640 assert(0 && "Impossible state");
1641 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001642
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001643 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001644 return (PyObject *)unicode;
1645}
1646
Alexander Belopolsky40018472011-02-26 01:02:56 +00001647PyObject *
1648PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001649{
1650 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001651
Benjamin Peterson14339b62009-01-31 16:36:08 +00001652 if (size < 0) {
1653 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001654 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001655 return NULL;
1656 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001657
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001658 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001659 some optimizations which share commonly used objects.
1660 Also, this means the input must be UTF-8, so fall back to the
1661 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001662 if (u != NULL) {
1663
Benjamin Peterson29060642009-01-31 22:14:21 +00001664 /* Optimization for empty strings */
1665 if (size == 0 && unicode_empty != NULL) {
1666 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001667 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001668 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001669
1670 /* Single characters are shared when using this constructor.
1671 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001672 if (size == 1 && Py_CHARMASK(*u) < 128)
1673 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001674
1675 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001676 }
1677
Walter Dörwald55507312007-05-18 13:12:10 +00001678 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001679 if (!unicode)
1680 return NULL;
1681
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001682 return (PyObject *)unicode;
1683}
1684
Alexander Belopolsky40018472011-02-26 01:02:56 +00001685PyObject *
1686PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001687{
1688 size_t size = strlen(u);
1689 if (size > PY_SSIZE_T_MAX) {
1690 PyErr_SetString(PyExc_OverflowError, "input too long");
1691 return NULL;
1692 }
1693
1694 return PyUnicode_FromStringAndSize(u, size);
1695}
1696
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001697PyObject *
1698_PyUnicode_FromId(_Py_Identifier *id)
1699{
1700 if (!id->object) {
1701 id->object = PyUnicode_FromString(id->string);
1702 if (!id->object)
1703 return NULL;
1704 PyUnicode_InternInPlace(&id->object);
1705 assert(!id->next);
1706 id->next = static_strings;
1707 static_strings = id;
1708 }
1709 Py_INCREF(id->object);
1710 return id->object;
1711}
1712
1713void
1714_PyUnicode_ClearStaticStrings()
1715{
1716 _Py_Identifier *i;
1717 for (i = static_strings; i; i = i->next) {
1718 Py_DECREF(i->object);
1719 i->object = NULL;
1720 i->next = NULL;
1721 }
1722}
1723
Victor Stinnere57b1c02011-09-28 22:20:48 +02001724static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001725unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001726{
Victor Stinner0617b6e2011-10-05 23:26:01 +02001727 PyObject *res;
1728#ifdef Py_DEBUG
1729 const unsigned char *p;
1730 const unsigned char *end = s + size;
1731 for (p=s; p < end; p++) {
1732 assert(*p < 128);
1733 }
1734#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001735 if (size == 1)
1736 return get_latin1_char(s[0]);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001737 res = PyUnicode_New(size, 127);
Victor Stinner702c7342011-10-05 13:50:52 +02001738 if (!res)
1739 return NULL;
Victor Stinner0617b6e2011-10-05 23:26:01 +02001740 memcpy(PyUnicode_1BYTE_DATA(res), s, size);
Victor Stinner702c7342011-10-05 13:50:52 +02001741 return res;
1742}
1743
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001744static Py_UCS4
1745kind_maxchar_limit(unsigned int kind)
1746{
1747 switch(kind) {
1748 case PyUnicode_1BYTE_KIND:
1749 return 0x80;
1750 case PyUnicode_2BYTE_KIND:
1751 return 0x100;
1752 case PyUnicode_4BYTE_KIND:
1753 return 0x10000;
1754 default:
1755 assert(0 && "invalid kind");
1756 return 0x10ffff;
1757 }
1758}
1759
Victor Stinner702c7342011-10-05 13:50:52 +02001760static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001761_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001762{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001763 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001764 unsigned char max_char = 127;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001765
1766 assert(size >= 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001767 if (size == 1)
1768 return get_latin1_char(u[0]);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001769 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001770 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001771 if (!res)
1772 return NULL;
1773 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001774 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001775 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001776}
1777
Victor Stinnere57b1c02011-09-28 22:20:48 +02001778static PyObject*
1779_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001780{
1781 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001782 Py_UCS2 max_char = 0;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001783
1784 assert(size >= 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001785 if (size == 1 && u[0] < 256)
Victor Stinner4e101002011-10-11 23:27:52 +02001786 return get_latin1_char((unsigned char)u[0]);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001787 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001788 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001789 if (!res)
1790 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001791 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001792 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001793 else {
1794 _PyUnicode_CONVERT_BYTES(
1795 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1796 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001797 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001798 return res;
1799}
1800
Victor Stinnere57b1c02011-09-28 22:20:48 +02001801static PyObject*
1802_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001803{
1804 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001805 Py_UCS4 max_char = 0;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001806
1807 assert(size >= 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001808 if (size == 1 && u[0] < 256)
1809 return get_latin1_char(u[0]);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001810 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001811 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001812 if (!res)
1813 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001814 if (max_char < 256)
1815 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1816 PyUnicode_1BYTE_DATA(res));
1817 else if (max_char < 0x10000)
1818 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1819 PyUnicode_2BYTE_DATA(res));
1820 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001821 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001822 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001823 return res;
1824}
1825
1826PyObject*
1827PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1828{
1829 switch(kind) {
1830 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001831 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001832 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001833 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001834 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001835 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001836 default:
1837 assert(0 && "invalid kind");
1838 PyErr_SetString(PyExc_SystemError, "invalid kind");
1839 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001840 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001841}
1842
Victor Stinner25a4b292011-10-06 12:31:55 +02001843/* Ensure that a string uses the most efficient storage, if it is not the
1844 case: create a new string with of the right kind. Write NULL into *p_unicode
1845 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001846static void
Victor Stinner25a4b292011-10-06 12:31:55 +02001847unicode_adjust_maxchar(PyObject **p_unicode)
1848{
1849 PyObject *unicode, *copy;
1850 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001851 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02001852 unsigned int kind;
1853
1854 assert(p_unicode != NULL);
1855 unicode = *p_unicode;
1856 assert(PyUnicode_IS_READY(unicode));
1857 if (PyUnicode_IS_ASCII(unicode))
1858 return;
1859
1860 len = PyUnicode_GET_LENGTH(unicode);
1861 kind = PyUnicode_KIND(unicode);
1862 if (kind == PyUnicode_1BYTE_KIND) {
1863 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001864 max_char = ucs1lib_find_max_char(u, u + len);
1865 if (max_char >= 128)
1866 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001867 }
1868 else if (kind == PyUnicode_2BYTE_KIND) {
1869 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001870 max_char = ucs2lib_find_max_char(u, u + len);
1871 if (max_char >= 256)
1872 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001873 }
1874 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001875 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02001876 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001877 max_char = ucs4lib_find_max_char(u, u + len);
1878 if (max_char >= 0x10000)
1879 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001880 }
Victor Stinner25a4b292011-10-06 12:31:55 +02001881 copy = PyUnicode_New(len, max_char);
1882 copy_characters(copy, 0, unicode, 0, len);
1883 Py_DECREF(unicode);
1884 *p_unicode = copy;
1885}
1886
Victor Stinner034f6cf2011-09-30 02:26:44 +02001887PyObject*
1888PyUnicode_Copy(PyObject *unicode)
1889{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001890 Py_ssize_t size;
1891 PyObject *copy;
1892 void *data;
1893
Victor Stinner034f6cf2011-09-30 02:26:44 +02001894 if (!PyUnicode_Check(unicode)) {
1895 PyErr_BadInternalCall();
1896 return NULL;
1897 }
1898 if (PyUnicode_READY(unicode))
1899 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001900
1901 size = PyUnicode_GET_LENGTH(unicode);
1902 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1903 if (!copy)
1904 return NULL;
1905 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1906
1907 data = PyUnicode_DATA(unicode);
1908 switch (PyUnicode_KIND(unicode))
1909 {
1910 case PyUnicode_1BYTE_KIND:
1911 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1912 break;
1913 case PyUnicode_2BYTE_KIND:
1914 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1915 break;
1916 case PyUnicode_4BYTE_KIND:
1917 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1918 break;
1919 default:
1920 assert(0);
1921 break;
1922 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001923 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001924 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001925}
1926
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001927
Victor Stinnerbc603d12011-10-02 01:00:40 +02001928/* Widen Unicode objects to larger buffers. Don't write terminating null
1929 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001930
1931void*
1932_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1933{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001934 Py_ssize_t len;
1935 void *result;
1936 unsigned int skind;
1937
1938 if (PyUnicode_READY(s))
1939 return NULL;
1940
1941 len = PyUnicode_GET_LENGTH(s);
1942 skind = PyUnicode_KIND(s);
1943 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02001944 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001945 return NULL;
1946 }
1947 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001948 case PyUnicode_2BYTE_KIND:
1949 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1950 if (!result)
1951 return PyErr_NoMemory();
1952 assert(skind == PyUnicode_1BYTE_KIND);
1953 _PyUnicode_CONVERT_BYTES(
1954 Py_UCS1, Py_UCS2,
1955 PyUnicode_1BYTE_DATA(s),
1956 PyUnicode_1BYTE_DATA(s) + len,
1957 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001958 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001959 case PyUnicode_4BYTE_KIND:
1960 result = PyMem_Malloc(len * sizeof(Py_UCS4));
1961 if (!result)
1962 return PyErr_NoMemory();
1963 if (skind == PyUnicode_2BYTE_KIND) {
1964 _PyUnicode_CONVERT_BYTES(
1965 Py_UCS2, Py_UCS4,
1966 PyUnicode_2BYTE_DATA(s),
1967 PyUnicode_2BYTE_DATA(s) + len,
1968 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001969 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001970 else {
1971 assert(skind == PyUnicode_1BYTE_KIND);
1972 _PyUnicode_CONVERT_BYTES(
1973 Py_UCS1, Py_UCS4,
1974 PyUnicode_1BYTE_DATA(s),
1975 PyUnicode_1BYTE_DATA(s) + len,
1976 result);
1977 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001978 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001979 default:
1980 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001981 }
Victor Stinner01698042011-10-04 00:04:26 +02001982 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001983 return NULL;
1984}
1985
1986static Py_UCS4*
1987as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1988 int copy_null)
1989{
1990 int kind;
1991 void *data;
1992 Py_ssize_t len, targetlen;
1993 if (PyUnicode_READY(string) == -1)
1994 return NULL;
1995 kind = PyUnicode_KIND(string);
1996 data = PyUnicode_DATA(string);
1997 len = PyUnicode_GET_LENGTH(string);
1998 targetlen = len;
1999 if (copy_null)
2000 targetlen++;
2001 if (!target) {
2002 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2003 PyErr_NoMemory();
2004 return NULL;
2005 }
2006 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2007 if (!target) {
2008 PyErr_NoMemory();
2009 return NULL;
2010 }
2011 }
2012 else {
2013 if (targetsize < targetlen) {
2014 PyErr_Format(PyExc_SystemError,
2015 "string is longer than the buffer");
2016 if (copy_null && 0 < targetsize)
2017 target[0] = 0;
2018 return NULL;
2019 }
2020 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002021 if (kind == PyUnicode_1BYTE_KIND) {
2022 Py_UCS1 *start = (Py_UCS1 *) data;
2023 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002024 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002025 else if (kind == PyUnicode_2BYTE_KIND) {
2026 Py_UCS2 *start = (Py_UCS2 *) data;
2027 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2028 }
2029 else {
2030 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002031 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002032 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002033 if (copy_null)
2034 target[len] = 0;
2035 return target;
2036}
2037
2038Py_UCS4*
2039PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2040 int copy_null)
2041{
2042 if (target == NULL || targetsize < 1) {
2043 PyErr_BadInternalCall();
2044 return NULL;
2045 }
2046 return as_ucs4(string, target, targetsize, copy_null);
2047}
2048
2049Py_UCS4*
2050PyUnicode_AsUCS4Copy(PyObject *string)
2051{
2052 return as_ucs4(string, NULL, 0, 1);
2053}
2054
2055#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002056
Alexander Belopolsky40018472011-02-26 01:02:56 +00002057PyObject *
2058PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002059{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002060 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +00002061 if (size == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002062 return PyUnicode_New(0, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00002063 PyErr_BadInternalCall();
2064 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002065 }
2066
Martin v. Löwis790465f2008-04-05 20:41:37 +00002067 if (size == -1) {
2068 size = wcslen(w);
2069 }
2070
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002071 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002072}
2073
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002074#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002075
Walter Dörwald346737f2007-05-31 10:44:43 +00002076static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002077makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2078 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002079{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002080 *fmt++ = '%';
2081 if (width) {
2082 if (zeropad)
2083 *fmt++ = '0';
2084 fmt += sprintf(fmt, "%d", width);
2085 }
2086 if (precision)
2087 fmt += sprintf(fmt, ".%d", precision);
2088 if (longflag)
2089 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002090 else if (longlongflag) {
2091 /* longlongflag should only ever be nonzero on machines with
2092 HAVE_LONG_LONG defined */
2093#ifdef HAVE_LONG_LONG
2094 char *f = PY_FORMAT_LONG_LONG;
2095 while (*f)
2096 *fmt++ = *f++;
2097#else
2098 /* we shouldn't ever get here */
2099 assert(0);
2100 *fmt++ = 'l';
2101#endif
2102 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002103 else if (size_tflag) {
2104 char *f = PY_FORMAT_SIZE_T;
2105 while (*f)
2106 *fmt++ = *f++;
2107 }
2108 *fmt++ = c;
2109 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002110}
2111
Victor Stinner96865452011-03-01 23:44:09 +00002112/* helper for PyUnicode_FromFormatV() */
2113
2114static const char*
2115parse_format_flags(const char *f,
2116 int *p_width, int *p_precision,
2117 int *p_longflag, int *p_longlongflag, int *p_size_tflag)
2118{
2119 int width, precision, longflag, longlongflag, size_tflag;
2120
2121 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2122 f++;
2123 width = 0;
2124 while (Py_ISDIGIT((unsigned)*f))
2125 width = (width*10) + *f++ - '0';
2126 precision = 0;
2127 if (*f == '.') {
2128 f++;
2129 while (Py_ISDIGIT((unsigned)*f))
2130 precision = (precision*10) + *f++ - '0';
2131 if (*f == '%') {
2132 /* "%.3%s" => f points to "3" */
2133 f--;
2134 }
2135 }
2136 if (*f == '\0') {
2137 /* bogus format "%.1" => go backward, f points to "1" */
2138 f--;
2139 }
2140 if (p_width != NULL)
2141 *p_width = width;
2142 if (p_precision != NULL)
2143 *p_precision = precision;
2144
2145 /* Handle %ld, %lu, %lld and %llu. */
2146 longflag = 0;
2147 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002148 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002149
2150 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002151 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002152 longflag = 1;
2153 ++f;
2154 }
2155#ifdef HAVE_LONG_LONG
2156 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002157 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002158 longlongflag = 1;
2159 f += 2;
2160 }
2161#endif
2162 }
2163 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002164 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002165 size_tflag = 1;
2166 ++f;
2167 }
2168 if (p_longflag != NULL)
2169 *p_longflag = longflag;
2170 if (p_longlongflag != NULL)
2171 *p_longlongflag = longlongflag;
2172 if (p_size_tflag != NULL)
2173 *p_size_tflag = size_tflag;
2174 return f;
2175}
2176
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002177/* maximum number of characters required for output of %ld. 21 characters
2178 allows for 64-bit integers (in decimal) and an optional sign. */
2179#define MAX_LONG_CHARS 21
2180/* maximum number of characters required for output of %lld.
2181 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2182 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2183#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2184
Walter Dörwaldd2034312007-05-18 16:29:38 +00002185PyObject *
2186PyUnicode_FromFormatV(const char *format, va_list vargs)
2187{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002188 va_list count;
2189 Py_ssize_t callcount = 0;
2190 PyObject **callresults = NULL;
2191 PyObject **callresult = NULL;
2192 Py_ssize_t n = 0;
2193 int width = 0;
2194 int precision = 0;
2195 int zeropad;
2196 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002197 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002198 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002199 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002200 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2201 Py_UCS4 argmaxchar;
2202 Py_ssize_t numbersize = 0;
2203 char *numberresults = NULL;
2204 char *numberresult = NULL;
2205 Py_ssize_t i;
2206 int kind;
2207 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002208
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002209 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002210 /* step 1: count the number of %S/%R/%A/%s format specifications
2211 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2212 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002213 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002214 * also estimate a upper bound for all the number formats in the string,
2215 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002216 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002217 for (f = format; *f; f++) {
2218 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002219 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002220 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2221 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2222 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2223 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002224
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002225 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002226#ifdef HAVE_LONG_LONG
2227 if (longlongflag) {
2228 if (width < MAX_LONG_LONG_CHARS)
2229 width = MAX_LONG_LONG_CHARS;
2230 }
2231 else
2232#endif
2233 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2234 including sign. Decimal takes the most space. This
2235 isn't enough for octal. If a width is specified we
2236 need more (which we allocate later). */
2237 if (width < MAX_LONG_CHARS)
2238 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002239
2240 /* account for the size + '\0' to separate numbers
2241 inside of the numberresults buffer */
2242 numbersize += (width + 1);
2243 }
2244 }
2245 else if ((unsigned char)*f > 127) {
2246 PyErr_Format(PyExc_ValueError,
2247 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2248 "string, got a non-ASCII byte: 0x%02x",
2249 (unsigned char)*f);
2250 return NULL;
2251 }
2252 }
2253 /* step 2: allocate memory for the results of
2254 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2255 if (callcount) {
2256 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2257 if (!callresults) {
2258 PyErr_NoMemory();
2259 return NULL;
2260 }
2261 callresult = callresults;
2262 }
2263 /* step 2.5: allocate memory for the results of formating numbers */
2264 if (numbersize) {
2265 numberresults = PyObject_Malloc(numbersize);
2266 if (!numberresults) {
2267 PyErr_NoMemory();
2268 goto fail;
2269 }
2270 numberresult = numberresults;
2271 }
2272
2273 /* step 3: format numbers and figure out how large a buffer we need */
2274 for (f = format; *f; f++) {
2275 if (*f == '%') {
2276 const char* p;
2277 int longflag;
2278 int longlongflag;
2279 int size_tflag;
2280 int numprinted;
2281
2282 p = f;
2283 zeropad = (f[1] == '0');
2284 f = parse_format_flags(f, &width, &precision,
2285 &longflag, &longlongflag, &size_tflag);
2286 switch (*f) {
2287 case 'c':
2288 {
2289 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002290 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002291 n++;
2292 break;
2293 }
2294 case '%':
2295 n++;
2296 break;
2297 case 'i':
2298 case 'd':
2299 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2300 width, precision, *f);
2301 if (longflag)
2302 numprinted = sprintf(numberresult, fmt,
2303 va_arg(count, long));
2304#ifdef HAVE_LONG_LONG
2305 else if (longlongflag)
2306 numprinted = sprintf(numberresult, fmt,
2307 va_arg(count, PY_LONG_LONG));
2308#endif
2309 else if (size_tflag)
2310 numprinted = sprintf(numberresult, fmt,
2311 va_arg(count, Py_ssize_t));
2312 else
2313 numprinted = sprintf(numberresult, fmt,
2314 va_arg(count, int));
2315 n += numprinted;
2316 /* advance by +1 to skip over the '\0' */
2317 numberresult += (numprinted + 1);
2318 assert(*(numberresult - 1) == '\0');
2319 assert(*(numberresult - 2) != '\0');
2320 assert(numprinted >= 0);
2321 assert(numberresult <= numberresults + numbersize);
2322 break;
2323 case 'u':
2324 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2325 width, precision, 'u');
2326 if (longflag)
2327 numprinted = sprintf(numberresult, fmt,
2328 va_arg(count, unsigned long));
2329#ifdef HAVE_LONG_LONG
2330 else if (longlongflag)
2331 numprinted = sprintf(numberresult, fmt,
2332 va_arg(count, unsigned PY_LONG_LONG));
2333#endif
2334 else if (size_tflag)
2335 numprinted = sprintf(numberresult, fmt,
2336 va_arg(count, size_t));
2337 else
2338 numprinted = sprintf(numberresult, fmt,
2339 va_arg(count, unsigned int));
2340 n += numprinted;
2341 numberresult += (numprinted + 1);
2342 assert(*(numberresult - 1) == '\0');
2343 assert(*(numberresult - 2) != '\0');
2344 assert(numprinted >= 0);
2345 assert(numberresult <= numberresults + numbersize);
2346 break;
2347 case 'x':
2348 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2349 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2350 n += numprinted;
2351 numberresult += (numprinted + 1);
2352 assert(*(numberresult - 1) == '\0');
2353 assert(*(numberresult - 2) != '\0');
2354 assert(numprinted >= 0);
2355 assert(numberresult <= numberresults + numbersize);
2356 break;
2357 case 'p':
2358 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2359 /* %p is ill-defined: ensure leading 0x. */
2360 if (numberresult[1] == 'X')
2361 numberresult[1] = 'x';
2362 else if (numberresult[1] != 'x') {
2363 memmove(numberresult + 2, numberresult,
2364 strlen(numberresult) + 1);
2365 numberresult[0] = '0';
2366 numberresult[1] = 'x';
2367 numprinted += 2;
2368 }
2369 n += numprinted;
2370 numberresult += (numprinted + 1);
2371 assert(*(numberresult - 1) == '\0');
2372 assert(*(numberresult - 2) != '\0');
2373 assert(numprinted >= 0);
2374 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002375 break;
2376 case 's':
2377 {
2378 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002379 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002380 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2381 if (!str)
2382 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002383 /* since PyUnicode_DecodeUTF8 returns already flexible
2384 unicode objects, there is no need to call ready on them */
2385 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002386 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002387 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002388 /* Remember the str and switch to the next slot */
2389 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002390 break;
2391 }
2392 case 'U':
2393 {
2394 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002395 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002396 if (PyUnicode_READY(obj) == -1)
2397 goto fail;
2398 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002399 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002400 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002401 break;
2402 }
2403 case 'V':
2404 {
2405 PyObject *obj = va_arg(count, PyObject *);
2406 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002407 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002408 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002409 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002410 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002411 if (PyUnicode_READY(obj) == -1)
2412 goto fail;
2413 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002414 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002415 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002416 *callresult++ = NULL;
2417 }
2418 else {
2419 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2420 if (!str_obj)
2421 goto fail;
Victor Stinnere1335c72011-10-04 20:53:03 +02002422 if (PyUnicode_READY(str_obj)) {
2423 Py_DECREF(str_obj);
2424 goto fail;
2425 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002426 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002427 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002428 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002429 *callresult++ = str_obj;
2430 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002431 break;
2432 }
2433 case 'S':
2434 {
2435 PyObject *obj = va_arg(count, PyObject *);
2436 PyObject *str;
2437 assert(obj);
2438 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002439 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002440 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002441 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002442 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002443 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002444 /* Remember the str and switch to the next slot */
2445 *callresult++ = str;
2446 break;
2447 }
2448 case 'R':
2449 {
2450 PyObject *obj = va_arg(count, PyObject *);
2451 PyObject *repr;
2452 assert(obj);
2453 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002454 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002455 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002456 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002457 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002458 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002459 /* Remember the repr and switch to the next slot */
2460 *callresult++ = repr;
2461 break;
2462 }
2463 case 'A':
2464 {
2465 PyObject *obj = va_arg(count, PyObject *);
2466 PyObject *ascii;
2467 assert(obj);
2468 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002469 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002470 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002471 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002472 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002473 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002474 /* Remember the repr and switch to the next slot */
2475 *callresult++ = ascii;
2476 break;
2477 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002478 default:
2479 /* if we stumble upon an unknown
2480 formatting code, copy the rest of
2481 the format string to the output
2482 string. (we cannot just skip the
2483 code, since there's no way to know
2484 what's in the argument list) */
2485 n += strlen(p);
2486 goto expand;
2487 }
2488 } else
2489 n++;
2490 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002491 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002492 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002493 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002494 we don't have to resize the string.
2495 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002496 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002497 if (!string)
2498 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002499 kind = PyUnicode_KIND(string);
2500 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002501 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002502 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002503
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002504 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002505 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002506 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002507
2508 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002509 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2510 /* checking for == because the last argument could be a empty
2511 string, which causes i to point to end, the assert at the end of
2512 the loop */
2513 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002514
Benjamin Peterson14339b62009-01-31 16:36:08 +00002515 switch (*f) {
2516 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002517 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002518 const int ordinal = va_arg(vargs, int);
2519 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002520 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002521 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002522 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002523 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002524 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002525 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002526 case 'p':
2527 /* unused, since we already have the result */
2528 if (*f == 'p')
2529 (void) va_arg(vargs, void *);
2530 else
2531 (void) va_arg(vargs, int);
2532 /* extract the result from numberresults and append. */
2533 for (; *numberresult; ++i, ++numberresult)
2534 PyUnicode_WRITE(kind, data, i, *numberresult);
2535 /* skip over the separating '\0' */
2536 assert(*numberresult == '\0');
2537 numberresult++;
2538 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002539 break;
2540 case 's':
2541 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002542 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002543 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002544 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002545 size = PyUnicode_GET_LENGTH(*callresult);
2546 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002547 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002548 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002549 /* We're done with the unicode()/repr() => forget it */
2550 Py_DECREF(*callresult);
2551 /* switch to next unicode()/repr() result */
2552 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002553 break;
2554 }
2555 case 'U':
2556 {
2557 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002558 Py_ssize_t size;
2559 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2560 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002561 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002562 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002563 break;
2564 }
2565 case 'V':
2566 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002567 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002568 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002569 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002570 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002571 size = PyUnicode_GET_LENGTH(obj);
2572 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002573 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002574 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002575 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002576 size = PyUnicode_GET_LENGTH(*callresult);
2577 assert(PyUnicode_KIND(*callresult) <=
2578 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002579 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002580 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002581 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002582 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002583 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002584 break;
2585 }
2586 case 'S':
2587 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002588 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002589 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002590 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002591 /* unused, since we already have the result */
2592 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002593 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002594 copy_characters(string, i, *callresult, 0, size);
2595 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002596 /* We're done with the unicode()/repr() => forget it */
2597 Py_DECREF(*callresult);
2598 /* switch to next unicode()/repr() result */
2599 ++callresult;
2600 break;
2601 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002602 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002603 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002604 break;
2605 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002606 for (; *p; ++p, ++i)
2607 PyUnicode_WRITE(kind, data, i, *p);
2608 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002609 goto end;
2610 }
Victor Stinner1205f272010-09-11 00:54:47 +00002611 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002612 else {
2613 assert(i < PyUnicode_GET_LENGTH(string));
2614 PyUnicode_WRITE(kind, data, i++, *f);
2615 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002616 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002617 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002618
Benjamin Peterson29060642009-01-31 22:14:21 +00002619 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002620 if (callresults)
2621 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002622 if (numberresults)
2623 PyObject_Free(numberresults);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002624 assert(_PyUnicode_CheckConsistency(string, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002625 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00002626 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002627 if (callresults) {
2628 PyObject **callresult2 = callresults;
2629 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002630 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002631 ++callresult2;
2632 }
2633 PyObject_Free(callresults);
2634 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002635 if (numberresults)
2636 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002637 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002638}
2639
Walter Dörwaldd2034312007-05-18 16:29:38 +00002640PyObject *
2641PyUnicode_FromFormat(const char *format, ...)
2642{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002643 PyObject* ret;
2644 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002645
2646#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002647 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002648#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002649 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002650#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002651 ret = PyUnicode_FromFormatV(format, vargs);
2652 va_end(vargs);
2653 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002654}
2655
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002656#ifdef HAVE_WCHAR_H
2657
Victor Stinner5593d8a2010-10-02 11:11:27 +00002658/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2659 convert a Unicode object to a wide character string.
2660
Victor Stinnerd88d9832011-09-06 02:00:05 +02002661 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002662 character) required to convert the unicode object. Ignore size argument.
2663
Victor Stinnerd88d9832011-09-06 02:00:05 +02002664 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002665 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002666 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002667static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00002668unicode_aswidechar(PyUnicodeObject *unicode,
2669 wchar_t *w,
2670 Py_ssize_t size)
2671{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002672 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002673 const wchar_t *wstr;
2674
2675 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
2676 if (wstr == NULL)
2677 return -1;
2678
Victor Stinner5593d8a2010-10-02 11:11:27 +00002679 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002680 if (size > res)
2681 size = res + 1;
2682 else
2683 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002684 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002685 return res;
2686 }
2687 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002688 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002689}
2690
2691Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002692PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002693 wchar_t *w,
2694 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002695{
2696 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002697 PyErr_BadInternalCall();
2698 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002699 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002700 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002701}
2702
Victor Stinner137c34c2010-09-29 10:25:54 +00002703wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002704PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002705 Py_ssize_t *size)
2706{
2707 wchar_t* buffer;
2708 Py_ssize_t buflen;
2709
2710 if (unicode == NULL) {
2711 PyErr_BadInternalCall();
2712 return NULL;
2713 }
2714
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002715 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002716 if (buflen == -1)
2717 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002718 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002719 PyErr_NoMemory();
2720 return NULL;
2721 }
2722
Victor Stinner137c34c2010-09-29 10:25:54 +00002723 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2724 if (buffer == NULL) {
2725 PyErr_NoMemory();
2726 return NULL;
2727 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002728 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002729 if (buflen == -1)
2730 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002731 if (size != NULL)
2732 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002733 return buffer;
2734}
2735
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002736#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002737
Alexander Belopolsky40018472011-02-26 01:02:56 +00002738PyObject *
2739PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002740{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002741 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002742 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002743 PyErr_SetString(PyExc_ValueError,
2744 "chr() arg not in range(0x110000)");
2745 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002746 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002747
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002748 if (ordinal < 256)
2749 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002750
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002751 v = PyUnicode_New(1, ordinal);
2752 if (v == NULL)
2753 return NULL;
2754 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002755 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002756 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002757}
2758
Alexander Belopolsky40018472011-02-26 01:02:56 +00002759PyObject *
2760PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002761{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002762 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002763 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002764 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002765 if (PyUnicode_READY(obj))
2766 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002767 Py_INCREF(obj);
2768 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002769 }
2770 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002771 /* For a Unicode subtype that's not a Unicode object,
2772 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002773 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002774 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002775 PyErr_Format(PyExc_TypeError,
2776 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002777 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002778 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002779}
2780
Alexander Belopolsky40018472011-02-26 01:02:56 +00002781PyObject *
2782PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002783 const char *encoding,
2784 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002785{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002786 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002787 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002788
Guido van Rossumd57fd912000-03-10 22:53:23 +00002789 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002790 PyErr_BadInternalCall();
2791 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002792 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002793
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002794 /* Decoding bytes objects is the most common case and should be fast */
2795 if (PyBytes_Check(obj)) {
2796 if (PyBytes_GET_SIZE(obj) == 0) {
2797 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002798 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002799 }
2800 else {
2801 v = PyUnicode_Decode(
2802 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2803 encoding, errors);
2804 }
2805 return v;
2806 }
2807
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002808 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002809 PyErr_SetString(PyExc_TypeError,
2810 "decoding str is not supported");
2811 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002812 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002813
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002814 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2815 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2816 PyErr_Format(PyExc_TypeError,
2817 "coercing to str: need bytes, bytearray "
2818 "or buffer-like object, %.80s found",
2819 Py_TYPE(obj)->tp_name);
2820 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002821 }
Tim Petersced69f82003-09-16 20:30:58 +00002822
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002823 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002824 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002825 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002826 }
Tim Petersced69f82003-09-16 20:30:58 +00002827 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002828 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002829
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002830 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002831 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002832}
2833
/* Write a normalized copy of 'encoding' into 'lower' (a buffer of
   'lower_len' bytes): upper case letters become lower case and '_'
   becomes '-', so that e.g. "UTF_8" is recognised as "utf-8".
   A NULL encoding is normalized to "utf-8".

   Return 1 on success, 0 if the encoding does not fit into the buffer
   (it is longer than lower_len-1 characters). */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *in;
    char *out;
    char *out_end;

    if (encoding == NULL) {
        strcpy(lower, "utf-8");
        return 1;
    }

    out = lower;
    /* last position of the buffer, reserved for the terminating '\0' */
    out_end = &lower[lower_len - 1];
    for (in = encoding; *in != '\0'; in++) {
        char c = *in;

        if (out == out_end)
            return 0;
        if (Py_ISUPPER(c))
            c = Py_TOLOWER(c);
        else if (c == '_')
            c = '-';
        *out++ = c;
    }
    *out = '\0';
    return 1;
}
2870
Alexander Belopolsky40018472011-02-26 01:02:56 +00002871PyObject *
2872PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002873 Py_ssize_t size,
2874 const char *encoding,
2875 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002876{
2877 PyObject *buffer = NULL, *unicode;
2878 Py_buffer info;
2879 char lower[11]; /* Enough for any encoding shortcut */
2880
Fred Drakee4315f52000-05-09 19:53:39 +00002881 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002882 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002883 if ((strcmp(lower, "utf-8") == 0) ||
2884 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002885 return PyUnicode_DecodeUTF8(s, size, errors);
2886 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002887 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002888 (strcmp(lower, "iso-8859-1") == 0))
2889 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002890#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002891 else if (strcmp(lower, "mbcs") == 0)
2892 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002893#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002894 else if (strcmp(lower, "ascii") == 0)
2895 return PyUnicode_DecodeASCII(s, size, errors);
2896 else if (strcmp(lower, "utf-16") == 0)
2897 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2898 else if (strcmp(lower, "utf-32") == 0)
2899 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2900 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002901
2902 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002903 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002904 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002905 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002906 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002907 if (buffer == NULL)
2908 goto onError;
2909 unicode = PyCodec_Decode(buffer, encoding, errors);
2910 if (unicode == NULL)
2911 goto onError;
2912 if (!PyUnicode_Check(unicode)) {
2913 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002914 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002915 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002916 Py_DECREF(unicode);
2917 goto onError;
2918 }
2919 Py_DECREF(buffer);
Victor Stinner17efeed2011-10-04 20:05:46 +02002920#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02002921 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002922 Py_DECREF(unicode);
2923 return NULL;
2924 }
Victor Stinner17efeed2011-10-04 20:05:46 +02002925#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002926 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002927 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002928
Benjamin Peterson29060642009-01-31 22:14:21 +00002929 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002930 Py_XDECREF(buffer);
2931 return NULL;
2932}
2933
Alexander Belopolsky40018472011-02-26 01:02:56 +00002934PyObject *
2935PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002936 const char *encoding,
2937 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002938{
2939 PyObject *v;
2940
2941 if (!PyUnicode_Check(unicode)) {
2942 PyErr_BadArgument();
2943 goto onError;
2944 }
2945
2946 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002947 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002948
2949 /* Decode via the codec registry */
2950 v = PyCodec_Decode(unicode, encoding, errors);
2951 if (v == NULL)
2952 goto onError;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002953 assert(_PyUnicode_CheckConsistency(v, 1));
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002954 return v;
2955
Benjamin Peterson29060642009-01-31 22:14:21 +00002956 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002957 return NULL;
2958}
2959
Alexander Belopolsky40018472011-02-26 01:02:56 +00002960PyObject *
2961PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002962 const char *encoding,
2963 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002964{
2965 PyObject *v;
2966
2967 if (!PyUnicode_Check(unicode)) {
2968 PyErr_BadArgument();
2969 goto onError;
2970 }
2971
2972 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002973 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002974
2975 /* Decode via the codec registry */
2976 v = PyCodec_Decode(unicode, encoding, errors);
2977 if (v == NULL)
2978 goto onError;
2979 if (!PyUnicode_Check(v)) {
2980 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002981 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002982 Py_TYPE(v)->tp_name);
2983 Py_DECREF(v);
2984 goto onError;
2985 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002986 assert(_PyUnicode_CheckConsistency(v, 1));
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002987 return v;
2988
Benjamin Peterson29060642009-01-31 22:14:21 +00002989 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002990 return NULL;
2991}
2992
Alexander Belopolsky40018472011-02-26 01:02:56 +00002993PyObject *
2994PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002995 Py_ssize_t size,
2996 const char *encoding,
2997 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002998{
2999 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003000
Guido van Rossumd57fd912000-03-10 22:53:23 +00003001 unicode = PyUnicode_FromUnicode(s, size);
3002 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003003 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003004 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3005 Py_DECREF(unicode);
3006 return v;
3007}
3008
Alexander Belopolsky40018472011-02-26 01:02:56 +00003009PyObject *
3010PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003011 const char *encoding,
3012 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003013{
3014 PyObject *v;
3015
3016 if (!PyUnicode_Check(unicode)) {
3017 PyErr_BadArgument();
3018 goto onError;
3019 }
3020
3021 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003022 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003023
3024 /* Encode via the codec registry */
3025 v = PyCodec_Encode(unicode, encoding, errors);
3026 if (v == NULL)
3027 goto onError;
3028 return v;
3029
Benjamin Peterson29060642009-01-31 22:14:21 +00003030 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003031 return NULL;
3032}
3033
Victor Stinnerad158722010-10-27 00:25:46 +00003034PyObject *
3035PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003036{
Victor Stinner99b95382011-07-04 14:23:54 +02003037#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003038 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3039 PyUnicode_GET_SIZE(unicode),
3040 NULL);
3041#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003042 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003043#else
Victor Stinner793b5312011-04-27 00:24:21 +02003044 PyInterpreterState *interp = PyThreadState_GET()->interp;
3045 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3046 cannot use it to encode and decode filenames before it is loaded. Load
3047 the Python codec requires to encode at least its own filename. Use the C
3048 version of the locale codec until the codec registry is initialized and
3049 the Python codec is loaded.
3050
3051 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3052 cannot only rely on it: check also interp->fscodec_initialized for
3053 subinterpreters. */
3054 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003055 return PyUnicode_AsEncodedString(unicode,
3056 Py_FileSystemDefaultEncoding,
3057 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003058 }
3059 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003060 /* locale encoding with surrogateescape */
3061 wchar_t *wchar;
3062 char *bytes;
3063 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00003064 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003065
3066 wchar = PyUnicode_AsWideCharString(unicode, NULL);
3067 if (wchar == NULL)
3068 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003069 bytes = _Py_wchar2char(wchar, &error_pos);
3070 if (bytes == NULL) {
3071 if (error_pos != (size_t)-1) {
3072 char *errmsg = strerror(errno);
3073 PyObject *exc = NULL;
3074 if (errmsg == NULL)
3075 errmsg = "Py_wchar2char() failed";
3076 raise_encode_exception(&exc,
3077 "filesystemencoding",
3078 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
3079 error_pos, error_pos+1,
3080 errmsg);
3081 Py_XDECREF(exc);
3082 }
3083 else
3084 PyErr_NoMemory();
3085 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003086 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003087 }
3088 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003089
3090 bytes_obj = PyBytes_FromString(bytes);
3091 PyMem_Free(bytes);
3092 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00003093 }
Victor Stinnerad158722010-10-27 00:25:46 +00003094#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003095}
3096
Alexander Belopolsky40018472011-02-26 01:02:56 +00003097PyObject *
3098PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003099 const char *encoding,
3100 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003101{
3102 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003103 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003104
Guido van Rossumd57fd912000-03-10 22:53:23 +00003105 if (!PyUnicode_Check(unicode)) {
3106 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003107 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003108 }
Fred Drakee4315f52000-05-09 19:53:39 +00003109
Fred Drakee4315f52000-05-09 19:53:39 +00003110 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003111 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003112 if ((strcmp(lower, "utf-8") == 0) ||
3113 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003114 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003115 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003116 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003117 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003118 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003119 }
Victor Stinner37296e82010-06-10 13:36:23 +00003120 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003121 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003122 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003123 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003124#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003125 else if (strcmp(lower, "mbcs") == 0)
3126 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3127 PyUnicode_GET_SIZE(unicode),
3128 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003129#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003130 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003131 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003132 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003133
3134 /* Encode via the codec registry */
3135 v = PyCodec_Encode(unicode, encoding, errors);
3136 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003137 return NULL;
3138
3139 /* The normal path */
3140 if (PyBytes_Check(v))
3141 return v;
3142
3143 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003144 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003145 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003146 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003147
3148 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3149 "encoder %s returned bytearray instead of bytes",
3150 encoding);
3151 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003152 Py_DECREF(v);
3153 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003154 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003155
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003156 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3157 Py_DECREF(v);
3158 return b;
3159 }
3160
3161 PyErr_Format(PyExc_TypeError,
3162 "encoder did not return a bytes object (type=%.400s)",
3163 Py_TYPE(v)->tp_name);
3164 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003165 return NULL;
3166}
3167
Alexander Belopolsky40018472011-02-26 01:02:56 +00003168PyObject *
3169PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003170 const char *encoding,
3171 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003172{
3173 PyObject *v;
3174
3175 if (!PyUnicode_Check(unicode)) {
3176 PyErr_BadArgument();
3177 goto onError;
3178 }
3179
3180 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003181 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003182
3183 /* Encode via the codec registry */
3184 v = PyCodec_Encode(unicode, encoding, errors);
3185 if (v == NULL)
3186 goto onError;
3187 if (!PyUnicode_Check(v)) {
3188 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003189 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003190 Py_TYPE(v)->tp_name);
3191 Py_DECREF(v);
3192 goto onError;
3193 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003194 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003195
Benjamin Peterson29060642009-01-31 22:14:21 +00003196 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003197 return NULL;
3198}
3199
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003200PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003201PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003202 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003203 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3204}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003205
Christian Heimes5894ba72007-11-04 11:43:14 +00003206PyObject*
3207PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3208{
Victor Stinner99b95382011-07-04 14:23:54 +02003209#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003210 return PyUnicode_DecodeMBCS(s, size, NULL);
3211#elif defined(__APPLE__)
3212 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
3213#else
Victor Stinner793b5312011-04-27 00:24:21 +02003214 PyInterpreterState *interp = PyThreadState_GET()->interp;
3215 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3216 cannot use it to encode and decode filenames before it is loaded. Load
3217 the Python codec requires to encode at least its own filename. Use the C
3218 version of the locale codec until the codec registry is initialized and
3219 the Python codec is loaded.
3220
3221 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3222 cannot only rely on it: check also interp->fscodec_initialized for
3223 subinterpreters. */
3224 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003225 return PyUnicode_Decode(s, size,
3226 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003227 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003228 }
3229 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003230 /* locale encoding with surrogateescape */
3231 wchar_t *wchar;
3232 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00003233 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003234
3235 if (s[size] != '\0' || size != strlen(s)) {
3236 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3237 return NULL;
3238 }
3239
Victor Stinner168e1172010-10-16 23:16:16 +00003240 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003241 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00003242 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003243
Victor Stinner168e1172010-10-16 23:16:16 +00003244 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003245 PyMem_Free(wchar);
3246 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003247 }
Victor Stinnerad158722010-10-27 00:25:46 +00003248#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003249}
3250
Martin v. Löwis011e8422009-05-05 04:43:17 +00003251
3252int
3253PyUnicode_FSConverter(PyObject* arg, void* addr)
3254{
3255 PyObject *output = NULL;
3256 Py_ssize_t size;
3257 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003258 if (arg == NULL) {
3259 Py_DECREF(*(PyObject**)addr);
3260 return 1;
3261 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003262 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003263 output = arg;
3264 Py_INCREF(output);
3265 }
3266 else {
3267 arg = PyUnicode_FromObject(arg);
3268 if (!arg)
3269 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003270 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003271 Py_DECREF(arg);
3272 if (!output)
3273 return 0;
3274 if (!PyBytes_Check(output)) {
3275 Py_DECREF(output);
3276 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3277 return 0;
3278 }
3279 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003280 size = PyBytes_GET_SIZE(output);
3281 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003282 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003283 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003284 Py_DECREF(output);
3285 return 0;
3286 }
3287 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003288 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003289}
3290
3291
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003292int
3293PyUnicode_FSDecoder(PyObject* arg, void* addr)
3294{
3295 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003296 if (arg == NULL) {
3297 Py_DECREF(*(PyObject**)addr);
3298 return 1;
3299 }
3300 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003301 if (PyUnicode_READY(arg))
3302 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003303 output = arg;
3304 Py_INCREF(output);
3305 }
3306 else {
3307 arg = PyBytes_FromObject(arg);
3308 if (!arg)
3309 return 0;
3310 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3311 PyBytes_GET_SIZE(arg));
3312 Py_DECREF(arg);
3313 if (!output)
3314 return 0;
3315 if (!PyUnicode_Check(output)) {
3316 Py_DECREF(output);
3317 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3318 return 0;
3319 }
3320 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003321 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003322 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003323 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3324 Py_DECREF(output);
3325 return 0;
3326 }
3327 *(PyObject**)addr = output;
3328 return Py_CLEANUP_SUPPORTED;
3329}
3330
3331
Martin v. Löwis5b222132007-06-10 09:51:05 +00003332char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003333PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003334{
Christian Heimesf3863112007-11-22 07:46:41 +00003335 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003336 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
3337
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003338 if (!PyUnicode_Check(unicode)) {
3339 PyErr_BadArgument();
3340 return NULL;
3341 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003342 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003343 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003344
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003345 if (PyUnicode_UTF8(unicode) == NULL) {
3346 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003347 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3348 if (bytes == NULL)
3349 return NULL;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003350 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3351 if (_PyUnicode_UTF8(u) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003352 Py_DECREF(bytes);
3353 return NULL;
3354 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003355 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
3356 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003357 Py_DECREF(bytes);
3358 }
3359
3360 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003361 *psize = PyUnicode_UTF8_LENGTH(unicode);
3362 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003363}
3364
3365char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003366PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003367{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003368 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3369}
3370
3371#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02003372static int unicode_as_unicode_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003373#endif
3374
3375
3376Py_UNICODE *
3377PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3378{
3379 PyUnicodeObject *u;
3380 const unsigned char *one_byte;
3381#if SIZEOF_WCHAR_T == 4
3382 const Py_UCS2 *two_bytes;
3383#else
3384 const Py_UCS4 *four_bytes;
3385 const Py_UCS4 *ucs4_end;
3386 Py_ssize_t num_surrogates;
3387#endif
3388 wchar_t *w;
3389 wchar_t *wchar_end;
3390
3391 if (!PyUnicode_Check(unicode)) {
3392 PyErr_BadArgument();
3393 return NULL;
3394 }
3395 u = (PyUnicodeObject*)unicode;
3396 if (_PyUnicode_WSTR(u) == NULL) {
3397 /* Non-ASCII compact unicode object */
3398 assert(_PyUnicode_KIND(u) != 0);
3399 assert(PyUnicode_IS_READY(u));
3400
3401#ifdef Py_DEBUG
3402 ++unicode_as_unicode_calls;
3403#endif
3404
3405 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
3406#if SIZEOF_WCHAR_T == 2
3407 four_bytes = PyUnicode_4BYTE_DATA(u);
3408 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
3409 num_surrogates = 0;
3410
3411 for (; four_bytes < ucs4_end; ++four_bytes) {
3412 if (*four_bytes > 0xFFFF)
3413 ++num_surrogates;
3414 }
3415
3416 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
3417 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
3418 if (!_PyUnicode_WSTR(u)) {
3419 PyErr_NoMemory();
3420 return NULL;
3421 }
3422 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
3423
3424 w = _PyUnicode_WSTR(u);
3425 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
3426 four_bytes = PyUnicode_4BYTE_DATA(u);
3427 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3428 if (*four_bytes > 0xFFFF) {
3429 /* encode surrogate pair in this case */
3430 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
3431 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
3432 }
3433 else
3434 *w = *four_bytes;
3435
3436 if (w > wchar_end) {
3437 assert(0 && "Miscalculated string end");
3438 }
3439 }
3440 *w = 0;
3441#else
3442 /* sizeof(wchar_t) == 4 */
3443 Py_FatalError("Impossible unicode object state, wstr and str "
3444 "should share memory already.");
3445 return NULL;
3446#endif
3447 }
3448 else {
3449 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3450 (_PyUnicode_LENGTH(u) + 1));
3451 if (!_PyUnicode_WSTR(u)) {
3452 PyErr_NoMemory();
3453 return NULL;
3454 }
3455 if (!PyUnicode_IS_COMPACT_ASCII(u))
3456 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
3457 w = _PyUnicode_WSTR(u);
3458 wchar_end = w + _PyUnicode_LENGTH(u);
3459
3460 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
3461 one_byte = PyUnicode_1BYTE_DATA(u);
3462 for (; w < wchar_end; ++one_byte, ++w)
3463 *w = *one_byte;
3464 /* null-terminate the wstr */
3465 *w = 0;
3466 }
3467 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
3468#if SIZEOF_WCHAR_T == 4
3469 two_bytes = PyUnicode_2BYTE_DATA(u);
3470 for (; w < wchar_end; ++two_bytes, ++w)
3471 *w = *two_bytes;
3472 /* null-terminate the wstr */
3473 *w = 0;
3474#else
3475 /* sizeof(wchar_t) == 2 */
3476 PyObject_FREE(_PyUnicode_WSTR(u));
3477 _PyUnicode_WSTR(u) = NULL;
3478 Py_FatalError("Impossible unicode object state, wstr "
3479 "and str should share memory already.");
3480 return NULL;
3481#endif
3482 }
3483 else {
3484 assert(0 && "This should never happen.");
3485 }
3486 }
3487 }
3488 if (size != NULL)
3489 *size = PyUnicode_WSTR_LENGTH(u);
3490 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003491}
3492
Alexander Belopolsky40018472011-02-26 01:02:56 +00003493Py_UNICODE *
3494PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003495{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003496 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003497}
3498
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003499
Alexander Belopolsky40018472011-02-26 01:02:56 +00003500Py_ssize_t
3501PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003502{
3503 if (!PyUnicode_Check(unicode)) {
3504 PyErr_BadArgument();
3505 goto onError;
3506 }
3507 return PyUnicode_GET_SIZE(unicode);
3508
Benjamin Peterson29060642009-01-31 22:14:21 +00003509 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003510 return -1;
3511}
3512
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003513Py_ssize_t
3514PyUnicode_GetLength(PyObject *unicode)
3515{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003516 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003517 PyErr_BadArgument();
3518 return -1;
3519 }
3520
3521 return PyUnicode_GET_LENGTH(unicode);
3522}
3523
3524Py_UCS4
3525PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3526{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003527 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3528 PyErr_BadArgument();
3529 return (Py_UCS4)-1;
3530 }
3531 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3532 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003533 return (Py_UCS4)-1;
3534 }
3535 return PyUnicode_READ_CHAR(unicode, index);
3536}
3537
3538int
3539PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3540{
3541 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003542 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003543 return -1;
3544 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003545 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3546 PyErr_SetString(PyExc_IndexError, "string index out of range");
3547 return -1;
3548 }
3549 if (_PyUnicode_Dirty(unicode))
3550 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003551 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3552 index, ch);
3553 return 0;
3554}
3555
/* Return the name of the default encoding, which is the fixed string
   "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3561
Victor Stinner554f3f02010-06-16 23:33:54 +00003562/* create or adjust a UnicodeDecodeError */
3563static void
3564make_decode_exception(PyObject **exceptionObject,
3565 const char *encoding,
3566 const char *input, Py_ssize_t length,
3567 Py_ssize_t startpos, Py_ssize_t endpos,
3568 const char *reason)
3569{
3570 if (*exceptionObject == NULL) {
3571 *exceptionObject = PyUnicodeDecodeError_Create(
3572 encoding, input, length, startpos, endpos, reason);
3573 }
3574 else {
3575 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3576 goto onError;
3577 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3578 goto onError;
3579 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3580 goto onError;
3581 }
3582 return;
3583
3584onError:
3585 Py_DECREF(*exceptionObject);
3586 *exceptionObject = NULL;
3587}
3588
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003589/* error handling callback helper:
3590 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003591 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003592 and adjust various state variables.
3593 return 0 on success, -1 on error
3594*/
3595
Alexander Belopolsky40018472011-02-26 01:02:56 +00003596static int
3597unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003598 const char *encoding, const char *reason,
3599 const char **input, const char **inend, Py_ssize_t *startinpos,
3600 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3601 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003602{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003603 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003604
3605 PyObject *restuple = NULL;
3606 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003607 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003608 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003609 Py_ssize_t requiredsize;
3610 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003611 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003612 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003613 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003614 int res = -1;
3615
3616 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003617 *errorHandler = PyCodec_LookupError(errors);
3618 if (*errorHandler == NULL)
3619 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003620 }
3621
Victor Stinner554f3f02010-06-16 23:33:54 +00003622 make_decode_exception(exceptionObject,
3623 encoding,
3624 *input, *inend - *input,
3625 *startinpos, *endinpos,
3626 reason);
3627 if (*exceptionObject == NULL)
3628 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003629
3630 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3631 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003632 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003633 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003634 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003635 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003636 }
3637 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003638 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003639
3640 /* Copy back the bytes variables, which might have been modified by the
3641 callback */
3642 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3643 if (!inputobj)
3644 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003645 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003646 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003647 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003648 *input = PyBytes_AS_STRING(inputobj);
3649 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003650 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003651 /* we can DECREF safely, as the exception has another reference,
3652 so the object won't go away. */
3653 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003654
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003655 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003656 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003657 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003658 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3659 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003660 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003661
3662 /* need more space? (at least enough for what we
3663 have+the replacement+the rest of the string (starting
3664 at the new input position), so we won't have to check space
3665 when there are no errors in the rest of the string) */
3666 repptr = PyUnicode_AS_UNICODE(repunicode);
3667 repsize = PyUnicode_GET_SIZE(repunicode);
3668 requiredsize = *outpos + repsize + insize-newpos;
3669 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003670 if (requiredsize<2*outsize)
3671 requiredsize = 2*outsize;
Victor Stinnerfe226c02011-10-03 03:52:20 +02003672 if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003673 goto onError;
3674 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003675 }
3676 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003677 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003678 Py_UNICODE_COPY(*outptr, repptr, repsize);
3679 *outptr += repsize;
3680 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003681
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003682 /* we made it! */
3683 res = 0;
3684
Benjamin Peterson29060642009-01-31 22:14:21 +00003685 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003686 Py_XDECREF(restuple);
3687 return res;
3688}
3689
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003690/* --- UTF-7 Codec -------------------------------------------------------- */
3691
Antoine Pitrou244651a2009-05-04 18:56:13 +00003692/* See RFC2152 for details. We encode conservatively and decode liberally. */
3693
/* The three base-64 helpers used by the UTF-7 codec.  All of them may
   evaluate their argument more than once, so pass only side-effect-free
   expressions. */

/* True if x is one of the 64 characters of the base-64 alphabet. */

#define IS_BASE64(x)                                    \
    ((x) == '+' || (x) == '/' ||                        \
     ((x) >= '0' && (x) <= '9') ||                      \
     ((x) >= 'a' && (x) <= 'z') ||                      \
     ((x) >= 'A' && (x) <= 'Z'))

/* Map a base-64 character to its 6-bit value.  The argument is expected
   to satisfy IS_BASE64(); any character not otherwise matched maps to 63,
   the value of '/'. */

#define FROM_BASE64(x)                                  \
    ((x) == '+' ? 62 :                                  \
     ((x) >= '0' && (x) <= '9') ? (x) - '0' + 52 :      \
     ((x) >= 'a' && (x) <= 'z') ? (x) - 'a' + 26 :      \
     ((x) >= 'A' && (x) <= 'Z') ? (x) - 'A' : 63)

/* Map the low 6 bits of v to the corresponding base-64 character. */

#define TO_BASE64(v)                    \
    (("ABCDEFGHIJKLMNOPQRSTUVWXYZ"      \
      "abcdefghijklmnopqrstuvwxyz"      \
      "0123456789+/")[(v) & 0x3f])
3716
/* DECODE_DIRECT: true when a byte found in a UTF-7 string stands for
 * itself.  Decoding is deliberately permissive: every ASCII byte is
 * accepted verbatim except '+', which opens a base64 section. */

#define DECODE_DIRECT(x)                \
    ((x) != '+' && (x) < 128)
3724
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
3758
/* ENCODE_DIRECT: true when character c may be emitted verbatim by the
 * encoder.  Set D characters always qualify; Set O characters qualify
 * only when directO is true and whitespace only when directWS is true.
 * RFC2152 leaves both choices to the application, hence the two flags.
 * NUL and anything outside ASCII never qualify.  The argument c is
 * evaluated several times. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) > 0 && (c) < 128 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((utf7_category[(c)] == 1) && (directO)) ||       \
      ((utf7_category[(c)] == 2) && (directWS))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003770
Alexander Belopolsky40018472011-02-26 01:02:56 +00003771PyObject *
3772PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003773 Py_ssize_t size,
3774 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003775{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003776 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3777}
3778
Antoine Pitrou244651a2009-05-04 18:56:13 +00003779/* The decoder. The only state we preserve is our read position,
3780 * i.e. how many characters we have consumed. So if we end in the
3781 * middle of a shift sequence we have to back off the read position
3782 * and the output to the beginning of the sequence, otherwise we lose
3783 * all the shift state (seen bits, number of bits seen, high
3784 * surrogate). */
3785
Alexander Belopolsky40018472011-02-26 01:02:56 +00003786PyObject *
3787PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003788 Py_ssize_t size,
3789 const char *errors,
3790 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003791{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003792 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003793 Py_ssize_t startinpos;
3794 Py_ssize_t endinpos;
3795 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003796 const char *e;
3797 PyUnicodeObject *unicode;
3798 Py_UNICODE *p;
3799 const char *errmsg = "";
3800 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003801 Py_UNICODE *shiftOutStart;
3802 unsigned int base64bits = 0;
3803 unsigned long base64buffer = 0;
3804 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003805 PyObject *errorHandler = NULL;
3806 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003807
3808 unicode = _PyUnicode_New(size);
3809 if (!unicode)
3810 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003811 if (size == 0) {
3812 if (consumed)
3813 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003814 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003815 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003816
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003817 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003818 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003819 e = s + size;
3820
3821 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003822 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003823 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003824 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003825
Antoine Pitrou244651a2009-05-04 18:56:13 +00003826 if (inShift) { /* in a base-64 section */
3827 if (IS_BASE64(ch)) { /* consume a base-64 character */
3828 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3829 base64bits += 6;
3830 s++;
3831 if (base64bits >= 16) {
3832 /* we have enough bits for a UTF-16 value */
3833 Py_UNICODE outCh = (Py_UNICODE)
3834 (base64buffer >> (base64bits-16));
3835 base64bits -= 16;
3836 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3837 if (surrogate) {
3838 /* expecting a second surrogate */
3839 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3840#ifdef Py_UNICODE_WIDE
3841 *p++ = (((surrogate & 0x3FF)<<10)
3842 | (outCh & 0x3FF)) + 0x10000;
3843#else
3844 *p++ = surrogate;
3845 *p++ = outCh;
3846#endif
3847 surrogate = 0;
3848 }
3849 else {
3850 surrogate = 0;
3851 errmsg = "second surrogate missing";
3852 goto utf7Error;
3853 }
3854 }
3855 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3856 /* first surrogate */
3857 surrogate = outCh;
3858 }
3859 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3860 errmsg = "unexpected second surrogate";
3861 goto utf7Error;
3862 }
3863 else {
3864 *p++ = outCh;
3865 }
3866 }
3867 }
3868 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003869 inShift = 0;
3870 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003871 if (surrogate) {
3872 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003873 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003874 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003875 if (base64bits > 0) { /* left-over bits */
3876 if (base64bits >= 6) {
3877 /* We've seen at least one base-64 character */
3878 errmsg = "partial character in shift sequence";
3879 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003880 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003881 else {
3882 /* Some bits remain; they should be zero */
3883 if (base64buffer != 0) {
3884 errmsg = "non-zero padding bits in shift sequence";
3885 goto utf7Error;
3886 }
3887 }
3888 }
3889 if (ch != '-') {
3890 /* '-' is absorbed; other terminating
3891 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003892 *p++ = ch;
3893 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003894 }
3895 }
3896 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003897 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003898 s++; /* consume '+' */
3899 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003900 s++;
3901 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003902 }
3903 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003904 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003905 shiftOutStart = p;
3906 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003907 }
3908 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003909 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003910 *p++ = ch;
3911 s++;
3912 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003913 else {
3914 startinpos = s-starts;
3915 s++;
3916 errmsg = "unexpected special character";
3917 goto utf7Error;
3918 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003919 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003920utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003921 outpos = p-PyUnicode_AS_UNICODE(unicode);
3922 endinpos = s-starts;
3923 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003924 errors, &errorHandler,
3925 "utf7", errmsg,
3926 &starts, &e, &startinpos, &endinpos, &exc, &s,
3927 &unicode, &outpos, &p))
3928 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003929 }
3930
Antoine Pitrou244651a2009-05-04 18:56:13 +00003931 /* end of string */
3932
3933 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3934 /* if we're in an inconsistent state, that's an error */
3935 if (surrogate ||
3936 (base64bits >= 6) ||
3937 (base64bits > 0 && base64buffer != 0)) {
3938 outpos = p-PyUnicode_AS_UNICODE(unicode);
3939 endinpos = size;
3940 if (unicode_decode_call_errorhandler(
3941 errors, &errorHandler,
3942 "utf7", "unterminated shift sequence",
3943 &starts, &e, &startinpos, &endinpos, &exc, &s,
3944 &unicode, &outpos, &p))
3945 goto onError;
3946 if (s < e)
3947 goto restart;
3948 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003949 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003950
3951 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003952 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003953 if (inShift) {
3954 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003955 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003956 }
3957 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003958 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003959 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003960 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003961
Victor Stinnerfe226c02011-10-03 03:52:20 +02003962 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003963 goto onError;
3964
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003965 Py_XDECREF(errorHandler);
3966 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02003967#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02003968 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003969 Py_DECREF(unicode);
3970 return NULL;
3971 }
Victor Stinner17efeed2011-10-04 20:05:46 +02003972#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02003973 assert(_PyUnicode_CheckConsistency(unicode, 1));
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003974 return (PyObject *)unicode;
3975
Benjamin Peterson29060642009-01-31 22:14:21 +00003976 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003977 Py_XDECREF(errorHandler);
3978 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003979 Py_DECREF(unicode);
3980 return NULL;
3981}
3982
3983
Alexander Belopolsky40018472011-02-26 01:02:56 +00003984PyObject *
3985PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003986 Py_ssize_t size,
3987 int base64SetO,
3988 int base64WhiteSpace,
3989 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003990{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003991 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003992 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003993 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003994 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003995 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003996 unsigned int base64bits = 0;
3997 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003998 char * out;
3999 char * start;
4000
4001 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004002 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004003
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00004004 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004005 return PyErr_NoMemory();
4006
Antoine Pitrou244651a2009-05-04 18:56:13 +00004007 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004008 if (v == NULL)
4009 return NULL;
4010
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004011 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004012 for (;i < size; ++i) {
4013 Py_UNICODE ch = s[i];
4014
Antoine Pitrou244651a2009-05-04 18:56:13 +00004015 if (inShift) {
4016 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4017 /* shifting out */
4018 if (base64bits) { /* output remaining bits */
4019 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4020 base64buffer = 0;
4021 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004022 }
4023 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004024 /* Characters not in the BASE64 set implicitly unshift the sequence
4025 so no '-' is required, except if the character is itself a '-' */
4026 if (IS_BASE64(ch) || ch == '-') {
4027 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004028 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004029 *out++ = (char) ch;
4030 }
4031 else {
4032 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004033 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004034 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004035 else { /* not in a shift sequence */
4036 if (ch == '+') {
4037 *out++ = '+';
4038 *out++ = '-';
4039 }
4040 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4041 *out++ = (char) ch;
4042 }
4043 else {
4044 *out++ = '+';
4045 inShift = 1;
4046 goto encode_char;
4047 }
4048 }
4049 continue;
4050encode_char:
4051#ifdef Py_UNICODE_WIDE
4052 if (ch >= 0x10000) {
4053 /* code first surrogate */
4054 base64bits += 16;
4055 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4056 while (base64bits >= 6) {
4057 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4058 base64bits -= 6;
4059 }
4060 /* prepare second surrogate */
4061 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
4062 }
4063#endif
4064 base64bits += 16;
4065 base64buffer = (base64buffer << 16) | ch;
4066 while (base64bits >= 6) {
4067 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4068 base64bits -= 6;
4069 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004070 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004071 if (base64bits)
4072 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4073 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004074 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004075 if (_PyBytes_Resize(&v, out - start) < 0)
4076 return NULL;
4077 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004078}
4079
/* The helper macros above are private to the UTF-7 codec; undefine them
   so that they do not leak into the rest of the file. */
#undef IS_BASE64
#undef FROM_BASE64
#undef TO_BASE64
#undef DECODE_DIRECT
#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004085
Guido van Rossumd57fd912000-03-10 22:53:23 +00004086/* --- UTF-8 Codec -------------------------------------------------------- */
4087
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details.
       The table is only ever read, so it is declared const. */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
4109
Alexander Belopolsky40018472011-02-26 01:02:56 +00004110PyObject *
4111PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004112 Py_ssize_t size,
4113 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004114{
Walter Dörwald69652032004-09-07 20:24:22 +00004115 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4116}
4117
Antoine Pitrouab868312009-01-10 15:40:25 +00004118/* Mask to check or force alignment of a pointer to C 'long' boundaries */
4119#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
4120
4121/* Mask to quickly check whether a C 'long' contains a
4122 non-ASCII, UTF8-encoded char. */
4123#if (SIZEOF_LONG == 8)
4124# define ASCII_CHAR_MASK 0x8080808080808080L
4125#elif (SIZEOF_LONG == 4)
4126# define ASCII_CHAR_MASK 0x80808080L
4127#else
4128# error C 'long' size should be either 4 or 8!
4129#endif
4130
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004131/* Scans a UTF-8 string and returns the maximum character to be expected,
4132 the size of the decoded unicode string and if any major errors were
4133 encountered.
4134
4135 This function does check basic UTF-8 sanity, it does however NOT CHECK
4136 if the string contains surrogates, and if all continuation bytes are
4137 within the correct ranges, these checks are performed in
4138 PyUnicode_DecodeUTF8Stateful.
4139
4140 If it sets has_errors to 1, it means the value of unicode_size and max_char
4141 will be bogus and you should not rely on useful information in them.
4142 */
4143static Py_UCS4
4144utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
4145 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
4146 int *has_errors)
4147{
4148 Py_ssize_t n;
4149 Py_ssize_t char_count = 0;
4150 Py_UCS4 max_char = 127, new_max;
4151 Py_UCS4 upper_bound;
4152 const unsigned char *p = (const unsigned char *)s;
4153 const unsigned char *end = p + string_size;
4154 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
4155 int err = 0;
4156
4157 for (; p < end && !err; ++p, ++char_count) {
4158 /* Only check value if it's not a ASCII char... */
4159 if (*p < 0x80) {
4160 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4161 an explanation. */
4162 if (!((size_t) p & LONG_PTR_MASK)) {
4163 /* Help register allocation */
4164 register const unsigned char *_p = p;
4165 while (_p < aligned_end) {
4166 unsigned long value = *(unsigned long *) _p;
4167 if (value & ASCII_CHAR_MASK)
4168 break;
4169 _p += SIZEOF_LONG;
4170 char_count += SIZEOF_LONG;
4171 }
4172 p = _p;
4173 if (p == end)
4174 break;
4175 }
4176 }
4177 if (*p >= 0x80) {
4178 n = utf8_code_length[*p];
4179 new_max = max_char;
4180 switch (n) {
4181 /* invalid start byte */
4182 case 0:
4183 err = 1;
4184 break;
4185 case 2:
4186 /* Code points between 0x00FF and 0x07FF inclusive.
4187 Approximate the upper bound of the code point,
4188 if this flips over 255 we can be sure it will be more
4189 than 255 and the string will need 2 bytes per code coint,
4190 if it stays under or equal to 255, we can be sure 1 byte
4191 is enough.
4192 ((*p & 0b00011111) << 6) | 0b00111111 */
4193 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
4194 if (max_char < upper_bound)
4195 new_max = upper_bound;
4196 /* Ensure we track at least that we left ASCII space. */
4197 if (new_max < 128)
4198 new_max = 128;
4199 break;
4200 case 3:
4201 /* Between 0x0FFF and 0xFFFF inclusive, so values are
4202 always > 255 and <= 65535 and will always need 2 bytes. */
4203 if (max_char < 65535)
4204 new_max = 65535;
4205 break;
4206 case 4:
4207 /* Code point will be above 0xFFFF for sure in this case. */
4208 new_max = 65537;
4209 break;
4210 /* Internal error, this should be caught by the first if */
4211 case 1:
4212 default:
4213 assert(0 && "Impossible case in utf8_max_char_and_size");
4214 err = 1;
4215 }
4216 /* Instead of number of overall bytes for this code point,
Georg Brandl7597add2011-10-05 16:36:47 +02004217 n contains the number of following bytes: */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004218 --n;
4219 /* Check if the follow up chars are all valid continuation bytes */
4220 if (n >= 1) {
4221 const unsigned char *cont;
4222 if ((p + n) >= end) {
4223 if (consumed == 0)
4224 /* incomplete data, non-incremental decoding */
4225 err = 1;
4226 break;
4227 }
4228 for (cont = p + 1; cont < (p + n); ++cont) {
4229 if ((*cont & 0xc0) != 0x80) {
4230 err = 1;
4231 break;
4232 }
4233 }
4234 p += n;
4235 }
4236 else
4237 err = 1;
4238 max_char = new_max;
4239 }
4240 }
4241
4242 if (unicode_size)
4243 *unicode_size = char_count;
4244 if (has_errors)
4245 *has_errors = err;
4246 return max_char;
4247}
4248
/* Store `value` at position `index` of `buf`, picking the element type
   from `kind`.  Works like PyUnicode_WRITE, but additionally understands
   PyUnicode_WCHAR_KIND, i.e. the wstr buffer of the legacy unicode
   representation.  Exactly one branch runs, so `index` and `value` are
   each evaluated once. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value)                 \
    do {                                                                \
        const int kind_ = (kind);                                       \
        if (kind_ == PyUnicode_1BYTE_KIND)                              \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
        else if (kind_ == PyUnicode_2BYTE_KIND)                         \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value);             \
        else if (kind_ == PyUnicode_WCHAR_KIND)                         \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);       \
        else                                                            \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value);             \
    } while (0)
4263
Alexander Belopolsky40018472011-02-26 01:02:56 +00004264PyObject *
4265PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004266 Py_ssize_t size,
4267 const char *errors,
4268 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004269{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004270 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004271 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004272 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004273 Py_ssize_t startinpos;
4274 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00004275 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004276 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004277 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004278 PyObject *errorHandler = NULL;
4279 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004280 Py_UCS4 maxchar = 0;
4281 Py_ssize_t unicode_size;
4282 Py_ssize_t i;
4283 int kind;
4284 void *data;
4285 int has_errors;
4286 Py_UNICODE *error_outptr;
4287#if SIZEOF_WCHAR_T == 2
4288 Py_ssize_t wchar_offset = 0;
4289#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004290
Walter Dörwald69652032004-09-07 20:24:22 +00004291 if (size == 0) {
4292 if (consumed)
4293 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004294 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00004295 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004296 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
4297 consumed, &has_errors);
4298 if (has_errors) {
4299 unicode = _PyUnicode_New(size);
4300 if (!unicode)
4301 return NULL;
4302 kind = PyUnicode_WCHAR_KIND;
4303 data = PyUnicode_AS_UNICODE(unicode);
4304 assert(data != NULL);
4305 }
4306 else {
4307 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
4308 if (!unicode)
4309 return NULL;
4310 /* When the string is ASCII only, just use memcpy and return.
4311 unicode_size may be != size if there is an incomplete UTF-8
4312 sequence at the end of the ASCII block. */
4313 if (maxchar < 128 && size == unicode_size) {
4314 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
4315 return (PyObject *)unicode;
4316 }
4317 kind = PyUnicode_KIND(unicode);
4318 data = PyUnicode_DATA(unicode);
4319 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004320 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004321 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004322 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00004323 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004324
4325 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004326 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004327
4328 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004329 /* Fast path for runs of ASCII characters. Given that common UTF-8
4330 input will consist of an overwhelming majority of ASCII
4331 characters, we try to optimize for this case by checking
4332 as many characters as a C 'long' can contain.
4333 First, check if we can do an aligned read, as most CPUs have
4334 a penalty for unaligned reads.
4335 */
4336 if (!((size_t) s & LONG_PTR_MASK)) {
4337 /* Help register allocation */
4338 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004339 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004340 while (_s < aligned_end) {
4341 /* Read a whole long at a time (either 4 or 8 bytes),
4342 and do a fast unrolled copy if it only contains ASCII
4343 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004344 unsigned long value = *(unsigned long *) _s;
4345 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004346 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004347 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
4348 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
4349 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
4350 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004351#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004352 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
4353 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
4354 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
4355 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004356#endif
4357 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004358 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004359 }
4360 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004361 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004362 if (s == e)
4363 break;
4364 ch = (unsigned char)*s;
4365 }
4366 }
4367
4368 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004369 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004370 s++;
4371 continue;
4372 }
4373
4374 n = utf8_code_length[ch];
4375
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004376 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004377 if (consumed)
4378 break;
4379 else {
4380 errmsg = "unexpected end of data";
4381 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004382 endinpos = startinpos+1;
4383 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4384 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004385 goto utf8Error;
4386 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004387 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004388
4389 switch (n) {
4390
4391 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004392 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004393 startinpos = s-starts;
4394 endinpos = startinpos+1;
4395 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004396
4397 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004398 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004399 startinpos = s-starts;
4400 endinpos = startinpos+1;
4401 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004402
4403 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004404 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004405 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004406 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004407 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004408 goto utf8Error;
4409 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004410 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004411 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004412 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004413 break;
4414
4415 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004416 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4417 will result in surrogates in range d800-dfff. Surrogates are
4418 not valid UTF-8 so they are rejected.
4419 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4420 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004421 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004422 (s[2] & 0xc0) != 0x80 ||
4423 ((unsigned char)s[0] == 0xE0 &&
4424 (unsigned char)s[1] < 0xA0) ||
4425 ((unsigned char)s[0] == 0xED &&
4426 (unsigned char)s[1] > 0x9F)) {
4427 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004428 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004429 endinpos = startinpos + 1;
4430
4431 /* if s[1] first two bits are 1 and 0, then the invalid
4432 continuation byte is s[2], so increment endinpos by 1,
4433 if not, s[1] is invalid and endinpos doesn't need to
4434 be incremented. */
4435 if ((s[1] & 0xC0) == 0x80)
4436 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004437 goto utf8Error;
4438 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004439 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004440 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004441 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004442 break;
4443
4444 case 4:
4445 if ((s[1] & 0xc0) != 0x80 ||
4446 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004447 (s[3] & 0xc0) != 0x80 ||
4448 ((unsigned char)s[0] == 0xF0 &&
4449 (unsigned char)s[1] < 0x90) ||
4450 ((unsigned char)s[0] == 0xF4 &&
4451 (unsigned char)s[1] > 0x8F)) {
4452 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004453 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004454 endinpos = startinpos + 1;
4455 if ((s[1] & 0xC0) == 0x80) {
4456 endinpos++;
4457 if ((s[2] & 0xC0) == 0x80)
4458 endinpos++;
4459 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004460 goto utf8Error;
4461 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004462 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004463 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4464 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4465
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004466 /* If the string is flexible or we have native UCS-4, write
4467 directly.. */
4468 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
4469 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00004470
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004471 else {
4472 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00004473
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004474 /* translate from 10000..10FFFF to 0..FFFF */
4475 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00004476
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004477 /* high surrogate = top 10 bits added to D800 */
4478 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4479 (Py_UNICODE)(0xD800 + (ch >> 10)));
4480
4481 /* low surrogate = bottom 10 bits added to DC00 */
4482 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4483 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
4484 }
4485#if SIZEOF_WCHAR_T == 2
4486 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004487#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004488 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004489 }
4490 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004491 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004492
Benjamin Peterson29060642009-01-31 22:14:21 +00004493 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004494 /* If this is not yet a resizable string, make it one.. */
4495 if (kind != PyUnicode_WCHAR_KIND) {
4496 const Py_UNICODE *u;
4497 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
4498 if (!new_unicode)
4499 goto onError;
4500 u = PyUnicode_AsUnicode((PyObject *)unicode);
4501 if (!u)
4502 goto onError;
4503#if SIZEOF_WCHAR_T == 2
4504 i += wchar_offset;
4505#endif
4506 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
4507 Py_DECREF(unicode);
4508 unicode = new_unicode;
4509 kind = 0;
4510 data = PyUnicode_AS_UNICODE(new_unicode);
4511 assert(data != NULL);
4512 }
4513 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00004514 if (unicode_decode_call_errorhandler(
4515 errors, &errorHandler,
4516 "utf8", errmsg,
4517 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004518 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00004519 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004520 /* Update data because unicode_decode_call_errorhandler might have
4521 re-created or resized the unicode object. */
4522 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004523 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004524 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004525 /* Ensure the unicode_size calculation above was correct: */
4526 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
4527
Walter Dörwald69652032004-09-07 20:24:22 +00004528 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004529 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004530
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004531 /* Adjust length and ready string when it contained errors and
4532 is of the old resizable kind. */
4533 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004534 if (PyUnicode_Resize((PyObject**)&unicode, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004535 goto onError;
4536 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004537
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004538 Py_XDECREF(errorHandler);
4539 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02004540#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004541 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004542 Py_DECREF(unicode);
4543 return NULL;
4544 }
Victor Stinner17efeed2011-10-04 20:05:46 +02004545#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004546 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004547 return (PyObject *)unicode;
4548
Benjamin Peterson29060642009-01-31 22:14:21 +00004549 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004550 Py_XDECREF(errorHandler);
4551 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004552 Py_DECREF(unicode);
4553 return NULL;
4554}
4555
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004556#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00004557
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004558#ifdef __APPLE__
4559
4560/* Simplified UTF-8 decoder using surrogateescape error handler,
4561 used to decode the command line arguments on Mac OS X. */
4562
4563wchar_t*
4564_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4565{
4566 int n;
4567 const char *e;
4568 wchar_t *unicode, *p;
4569
4570 /* Note: size will always be longer than the resulting Unicode
4571 character count */
4572 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4573 PyErr_NoMemory();
4574 return NULL;
4575 }
4576 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4577 if (!unicode)
4578 return NULL;
4579
4580 /* Unpack UTF-8 encoded data */
4581 p = unicode;
4582 e = s + size;
4583 while (s < e) {
4584 Py_UCS4 ch = (unsigned char)*s;
4585
4586 if (ch < 0x80) {
4587 *p++ = (wchar_t)ch;
4588 s++;
4589 continue;
4590 }
4591
4592 n = utf8_code_length[ch];
4593 if (s + n > e) {
4594 goto surrogateescape;
4595 }
4596
4597 switch (n) {
4598 case 0:
4599 case 1:
4600 goto surrogateescape;
4601
4602 case 2:
4603 if ((s[1] & 0xc0) != 0x80)
4604 goto surrogateescape;
4605 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4606 assert ((ch > 0x007F) && (ch <= 0x07FF));
4607 *p++ = (wchar_t)ch;
4608 break;
4609
4610 case 3:
4611 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4612 will result in surrogates in range d800-dfff. Surrogates are
4613 not valid UTF-8 so they are rejected.
4614 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4615 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4616 if ((s[1] & 0xc0) != 0x80 ||
4617 (s[2] & 0xc0) != 0x80 ||
4618 ((unsigned char)s[0] == 0xE0 &&
4619 (unsigned char)s[1] < 0xA0) ||
4620 ((unsigned char)s[0] == 0xED &&
4621 (unsigned char)s[1] > 0x9F)) {
4622
4623 goto surrogateescape;
4624 }
4625 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4626 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004627 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004628 break;
4629
4630 case 4:
4631 if ((s[1] & 0xc0) != 0x80 ||
4632 (s[2] & 0xc0) != 0x80 ||
4633 (s[3] & 0xc0) != 0x80 ||
4634 ((unsigned char)s[0] == 0xF0 &&
4635 (unsigned char)s[1] < 0x90) ||
4636 ((unsigned char)s[0] == 0xF4 &&
4637 (unsigned char)s[1] > 0x8F)) {
4638 goto surrogateescape;
4639 }
4640 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4641 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4642 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4643
4644#if SIZEOF_WCHAR_T == 4
4645 *p++ = (wchar_t)ch;
4646#else
4647 /* compute and append the two surrogates: */
4648
4649 /* translate from 10000..10FFFF to 0..FFFF */
4650 ch -= 0x10000;
4651
4652 /* high surrogate = top 10 bits added to D800 */
4653 *p++ = (wchar_t)(0xD800 + (ch >> 10));
4654
4655 /* low surrogate = bottom 10 bits added to DC00 */
4656 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
4657#endif
4658 break;
4659 }
4660 s += n;
4661 continue;
4662
4663 surrogateescape:
4664 *p++ = 0xDC00 + ch;
4665 s++;
4666 }
4667 *p = L'\0';
4668 return unicode;
4669}
4670
4671#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004672
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004673/* Primary internal function which creates utf8 encoded bytes objects.
4674
4675 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004676 and allocate exactly as much space needed at the end. Else allocate the
4677 maximum possible needed (4 result bytes per Unicode character), and return
4678 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004679*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004680PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004681_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004682{
Tim Peters602f7402002-04-27 18:03:26 +00004683#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004684
Guido van Rossum98297ee2007-11-06 21:34:58 +00004685 Py_ssize_t i; /* index into s of next input byte */
4686 PyObject *result; /* result string object */
4687 char *p; /* next free byte in output buffer */
4688 Py_ssize_t nallocated; /* number of result bytes allocated */
4689 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004690 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004691 PyObject *errorHandler = NULL;
4692 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004693 int kind;
4694 void *data;
4695 Py_ssize_t size;
4696 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4697#if SIZEOF_WCHAR_T == 2
4698 Py_ssize_t wchar_offset = 0;
4699#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004700
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004701 if (!PyUnicode_Check(unicode)) {
4702 PyErr_BadArgument();
4703 return NULL;
4704 }
4705
4706 if (PyUnicode_READY(unicode) == -1)
4707 return NULL;
4708
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004709 if (PyUnicode_UTF8(unicode))
4710 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4711 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004712
4713 kind = PyUnicode_KIND(unicode);
4714 data = PyUnicode_DATA(unicode);
4715 size = PyUnicode_GET_LENGTH(unicode);
4716
Tim Peters602f7402002-04-27 18:03:26 +00004717 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004718
Tim Peters602f7402002-04-27 18:03:26 +00004719 if (size <= MAX_SHORT_UNICHARS) {
4720 /* Write into the stack buffer; nallocated can't overflow.
4721 * At the end, we'll allocate exactly as much heap space as it
4722 * turns out we need.
4723 */
4724 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004725 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004726 p = stackbuf;
4727 }
4728 else {
4729 /* Overallocate on the heap, and give the excess back at the end. */
4730 nallocated = size * 4;
4731 if (nallocated / 4 != size) /* overflow! */
4732 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004733 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004734 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004735 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004736 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004737 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004738
Tim Peters602f7402002-04-27 18:03:26 +00004739 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004740 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004741
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004742 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004743 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004744 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004745
Guido van Rossumd57fd912000-03-10 22:53:23 +00004746 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004747 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004748 *p++ = (char)(0xc0 | (ch >> 6));
4749 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004750 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004751 Py_ssize_t newpos;
4752 PyObject *rep;
4753 Py_ssize_t repsize, k, startpos;
4754 startpos = i-1;
4755#if SIZEOF_WCHAR_T == 2
4756 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004757#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004758 rep = unicode_encode_call_errorhandler(
4759 errors, &errorHandler, "utf-8", "surrogates not allowed",
4760 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4761 &exc, startpos, startpos+1, &newpos);
4762 if (!rep)
4763 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004764
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004765 if (PyBytes_Check(rep))
4766 repsize = PyBytes_GET_SIZE(rep);
4767 else
4768 repsize = PyUnicode_GET_SIZE(rep);
4769
4770 if (repsize > 4) {
4771 Py_ssize_t offset;
4772
4773 if (result == NULL)
4774 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004775 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004776 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004777
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004778 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4779 /* integer overflow */
4780 PyErr_NoMemory();
4781 goto error;
4782 }
4783 nallocated += repsize - 4;
4784 if (result != NULL) {
4785 if (_PyBytes_Resize(&result, nallocated) < 0)
4786 goto error;
4787 } else {
4788 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004789 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004790 goto error;
4791 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4792 }
4793 p = PyBytes_AS_STRING(result) + offset;
4794 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004795
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004796 if (PyBytes_Check(rep)) {
4797 char *prep = PyBytes_AS_STRING(rep);
4798 for(k = repsize; k > 0; k--)
4799 *p++ = *prep++;
4800 } else /* rep is unicode */ {
4801 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4802 Py_UNICODE c;
4803
4804 for(k=0; k<repsize; k++) {
4805 c = prep[k];
4806 if (0x80 <= c) {
4807 raise_encode_exception(&exc, "utf-8",
4808 PyUnicode_AS_UNICODE(unicode),
4809 size, i-1, i,
4810 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004811 goto error;
4812 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004813 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004814 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004815 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004816 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004817 } else if (ch < 0x10000) {
4818 *p++ = (char)(0xe0 | (ch >> 12));
4819 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4820 *p++ = (char)(0x80 | (ch & 0x3f));
4821 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004822 /* Encode UCS4 Unicode ordinals */
4823 *p++ = (char)(0xf0 | (ch >> 18));
4824 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4825 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4826 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004827#if SIZEOF_WCHAR_T == 2
4828 wchar_offset++;
4829#endif
Tim Peters602f7402002-04-27 18:03:26 +00004830 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004831 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004832
Guido van Rossum98297ee2007-11-06 21:34:58 +00004833 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004834 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004835 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004836 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004837 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004838 }
4839 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004840 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004841 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004842 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004843 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004844 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004845
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004846 Py_XDECREF(errorHandler);
4847 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004848 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004849 error:
4850 Py_XDECREF(errorHandler);
4851 Py_XDECREF(exc);
4852 Py_XDECREF(result);
4853 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004854
Tim Peters602f7402002-04-27 18:03:26 +00004855#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004856}
4857
Alexander Belopolsky40018472011-02-26 01:02:56 +00004858PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004859PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4860 Py_ssize_t size,
4861 const char *errors)
4862{
4863 PyObject *v, *unicode;
4864
4865 unicode = PyUnicode_FromUnicode(s, size);
4866 if (unicode == NULL)
4867 return NULL;
4868 v = _PyUnicode_AsUTF8String(unicode, errors);
4869 Py_DECREF(unicode);
4870 return v;
4871}
4872
4873PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004874PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004875{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004876 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004877}
4878
Walter Dörwald41980ca2007-08-16 21:55:45 +00004879/* --- UTF-32 Codec ------------------------------------------------------- */
4880
4881PyObject *
4882PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004883 Py_ssize_t size,
4884 const char *errors,
4885 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004886{
4887 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4888}
4889
4890PyObject *
4891PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004892 Py_ssize_t size,
4893 const char *errors,
4894 int *byteorder,
4895 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004896{
4897 const char *starts = s;
4898 Py_ssize_t startinpos;
4899 Py_ssize_t endinpos;
4900 Py_ssize_t outpos;
4901 PyUnicodeObject *unicode;
4902 Py_UNICODE *p;
4903#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004904 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004905 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004906#else
4907 const int pairs = 0;
4908#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004909 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004910 int bo = 0; /* assume native ordering by default */
4911 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004912 /* Offsets from q for retrieving bytes in the right order. */
4913#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4914 int iorder[] = {0, 1, 2, 3};
4915#else
4916 int iorder[] = {3, 2, 1, 0};
4917#endif
4918 PyObject *errorHandler = NULL;
4919 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004920
Walter Dörwald41980ca2007-08-16 21:55:45 +00004921 q = (unsigned char *)s;
4922 e = q + size;
4923
4924 if (byteorder)
4925 bo = *byteorder;
4926
4927 /* Check for BOM marks (U+FEFF) in the input and adjust current
4928 byte order setting accordingly. In native mode, the leading BOM
4929 mark is skipped, in all other modes, it is copied to the output
4930 stream as-is (giving a ZWNBSP character). */
4931 if (bo == 0) {
4932 if (size >= 4) {
4933 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004934 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004935#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004936 if (bom == 0x0000FEFF) {
4937 q += 4;
4938 bo = -1;
4939 }
4940 else if (bom == 0xFFFE0000) {
4941 q += 4;
4942 bo = 1;
4943 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004944#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004945 if (bom == 0x0000FEFF) {
4946 q += 4;
4947 bo = 1;
4948 }
4949 else if (bom == 0xFFFE0000) {
4950 q += 4;
4951 bo = -1;
4952 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004953#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004954 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004955 }
4956
4957 if (bo == -1) {
4958 /* force LE */
4959 iorder[0] = 0;
4960 iorder[1] = 1;
4961 iorder[2] = 2;
4962 iorder[3] = 3;
4963 }
4964 else if (bo == 1) {
4965 /* force BE */
4966 iorder[0] = 3;
4967 iorder[1] = 2;
4968 iorder[2] = 1;
4969 iorder[3] = 0;
4970 }
4971
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004972 /* On narrow builds we split characters outside the BMP into two
4973 codepoints => count how much extra space we need. */
4974#ifndef Py_UNICODE_WIDE
4975 for (qq = q; qq < e; qq += 4)
4976 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4977 pairs++;
4978#endif
4979
4980 /* This might be one to much, because of a BOM */
4981 unicode = _PyUnicode_New((size+3)/4+pairs);
4982 if (!unicode)
4983 return NULL;
4984 if (size == 0)
4985 return (PyObject *)unicode;
4986
4987 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004988 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004989
Walter Dörwald41980ca2007-08-16 21:55:45 +00004990 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004991 Py_UCS4 ch;
4992 /* remaining bytes at the end? (size should be divisible by 4) */
4993 if (e-q<4) {
4994 if (consumed)
4995 break;
4996 errmsg = "truncated data";
4997 startinpos = ((const char *)q)-starts;
4998 endinpos = ((const char *)e)-starts;
4999 goto utf32Error;
5000 /* The remaining input chars are ignored if the callback
5001 chooses to skip the input */
5002 }
5003 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5004 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005005
Benjamin Peterson29060642009-01-31 22:14:21 +00005006 if (ch >= 0x110000)
5007 {
5008 errmsg = "codepoint not in range(0x110000)";
5009 startinpos = ((const char *)q)-starts;
5010 endinpos = startinpos+4;
5011 goto utf32Error;
5012 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005013#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005014 if (ch >= 0x10000)
5015 {
5016 *p++ = 0xD800 | ((ch-0x10000) >> 10);
5017 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
5018 }
5019 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00005020#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005021 *p++ = ch;
5022 q += 4;
5023 continue;
5024 utf32Error:
5025 outpos = p-PyUnicode_AS_UNICODE(unicode);
5026 if (unicode_decode_call_errorhandler(
5027 errors, &errorHandler,
5028 "utf32", errmsg,
5029 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
5030 &unicode, &outpos, &p))
5031 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005032 }
5033
5034 if (byteorder)
5035 *byteorder = bo;
5036
5037 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005038 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005039
5040 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02005041 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005042 goto onError;
5043
5044 Py_XDECREF(errorHandler);
5045 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005046#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005047 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005048 Py_DECREF(unicode);
5049 return NULL;
5050 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005051#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005052 assert(_PyUnicode_CheckConsistency(unicode, 1));
Walter Dörwald41980ca2007-08-16 21:55:45 +00005053 return (PyObject *)unicode;
5054
Benjamin Peterson29060642009-01-31 22:14:21 +00005055 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005056 Py_DECREF(unicode);
5057 Py_XDECREF(errorHandler);
5058 Py_XDECREF(exc);
5059 return NULL;
5060}
5061
5062PyObject *
5063PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005064 Py_ssize_t size,
5065 const char *errors,
5066 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005067{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005068 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005069 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005070 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005071#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005072 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005073#else
5074 const int pairs = 0;
5075#endif
5076 /* Offsets from p for storing byte pairs in the right order. */
5077#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5078 int iorder[] = {0, 1, 2, 3};
5079#else
5080 int iorder[] = {3, 2, 1, 0};
5081#endif
5082
Benjamin Peterson29060642009-01-31 22:14:21 +00005083#define STORECHAR(CH) \
5084 do { \
5085 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5086 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5087 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5088 p[iorder[0]] = (CH) & 0xff; \
5089 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005090 } while(0)
5091
5092 /* In narrow builds we can output surrogate pairs as one codepoint,
5093 so we need less space. */
5094#ifndef Py_UNICODE_WIDE
5095 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005096 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
5097 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
5098 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005099#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005100 nsize = (size - pairs + (byteorder == 0));
5101 bytesize = nsize * 4;
5102 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005103 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005104 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005105 if (v == NULL)
5106 return NULL;
5107
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005108 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005109 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005110 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005111 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005112 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005113
5114 if (byteorder == -1) {
5115 /* force LE */
5116 iorder[0] = 0;
5117 iorder[1] = 1;
5118 iorder[2] = 2;
5119 iorder[3] = 3;
5120 }
5121 else if (byteorder == 1) {
5122 /* force BE */
5123 iorder[0] = 3;
5124 iorder[1] = 2;
5125 iorder[2] = 1;
5126 iorder[3] = 0;
5127 }
5128
5129 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005130 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005131#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005132 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
5133 Py_UCS4 ch2 = *s;
5134 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
5135 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
5136 s++;
5137 size--;
5138 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005139 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005140#endif
5141 STORECHAR(ch);
5142 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005143
5144 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005145 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005146#undef STORECHAR
5147}
5148
Alexander Belopolsky40018472011-02-26 01:02:56 +00005149PyObject *
5150PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005151{
5152 if (!PyUnicode_Check(unicode)) {
5153 PyErr_BadArgument();
5154 return NULL;
5155 }
5156 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005157 PyUnicode_GET_SIZE(unicode),
5158 NULL,
5159 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005160}
5161
Guido van Rossumd57fd912000-03-10 22:53:23 +00005162/* --- UTF-16 Codec ------------------------------------------------------- */
5163
Tim Peters772747b2001-08-09 22:21:55 +00005164PyObject *
5165PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005166 Py_ssize_t size,
5167 const char *errors,
5168 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005169{
Walter Dörwald69652032004-09-07 20:24:22 +00005170 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5171}
5172
Antoine Pitrouab868312009-01-10 15:40:25 +00005173/* Two masks for fast checking of whether a C 'long' may contain
5174 UTF16-encoded surrogate characters. This is an efficient heuristic,
5175 assuming that non-surrogate characters with a code point >= 0x8000 are
5176 rare in most input.
5177 FAST_CHAR_MASK is used when the input is in native byte ordering,
5178 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00005179*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005180#if (SIZEOF_LONG == 8)
5181# define FAST_CHAR_MASK 0x8000800080008000L
5182# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
5183#elif (SIZEOF_LONG == 4)
5184# define FAST_CHAR_MASK 0x80008000L
5185# define SWAPPED_FAST_CHAR_MASK 0x00800080L
5186#else
5187# error C 'long' size should be either 4 or 8!
5188#endif
5189
Walter Dörwald69652032004-09-07 20:24:22 +00005190PyObject *
5191PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005192 Py_ssize_t size,
5193 const char *errors,
5194 int *byteorder,
5195 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005196{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005197 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005198 Py_ssize_t startinpos;
5199 Py_ssize_t endinpos;
5200 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005201 PyUnicodeObject *unicode;
5202 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00005203 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005204 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005205 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005206 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005207 /* Offsets from q for retrieving byte pairs in the right order. */
5208#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5209 int ihi = 1, ilo = 0;
5210#else
5211 int ihi = 0, ilo = 1;
5212#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005213 PyObject *errorHandler = NULL;
5214 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005215
5216 /* Note: size will always be longer than the resulting Unicode
5217 character count */
5218 unicode = _PyUnicode_New(size);
5219 if (!unicode)
5220 return NULL;
5221 if (size == 0)
5222 return (PyObject *)unicode;
5223
5224 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005225 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00005226 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005227 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005228
5229 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005230 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005231
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005232 /* Check for BOM marks (U+FEFF) in the input and adjust current
5233 byte order setting accordingly. In native mode, the leading BOM
5234 mark is skipped, in all other modes, it is copied to the output
5235 stream as-is (giving a ZWNBSP character). */
5236 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005237 if (size >= 2) {
5238 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005239#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005240 if (bom == 0xFEFF) {
5241 q += 2;
5242 bo = -1;
5243 }
5244 else if (bom == 0xFFFE) {
5245 q += 2;
5246 bo = 1;
5247 }
Tim Petersced69f82003-09-16 20:30:58 +00005248#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005249 if (bom == 0xFEFF) {
5250 q += 2;
5251 bo = 1;
5252 }
5253 else if (bom == 0xFFFE) {
5254 q += 2;
5255 bo = -1;
5256 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005257#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005258 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005259 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005260
Tim Peters772747b2001-08-09 22:21:55 +00005261 if (bo == -1) {
5262 /* force LE */
5263 ihi = 1;
5264 ilo = 0;
5265 }
5266 else if (bo == 1) {
5267 /* force BE */
5268 ihi = 0;
5269 ilo = 1;
5270 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005271#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5272 native_ordering = ilo < ihi;
5273#else
5274 native_ordering = ilo > ihi;
5275#endif
Tim Peters772747b2001-08-09 22:21:55 +00005276
Antoine Pitrouab868312009-01-10 15:40:25 +00005277 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005278 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005279 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005280 /* First check for possible aligned read of a C 'long'. Unaligned
5281 reads are more expensive, better to defer to another iteration. */
5282 if (!((size_t) q & LONG_PTR_MASK)) {
5283 /* Fast path for runs of non-surrogate chars. */
5284 register const unsigned char *_q = q;
5285 Py_UNICODE *_p = p;
5286 if (native_ordering) {
5287 /* Native ordering is simple: as long as the input cannot
5288 possibly contain a surrogate char, do an unrolled copy
5289 of several 16-bit code points to the target object.
5290 The non-surrogate check is done on several input bytes
5291 at a time (as many as a C 'long' can contain). */
5292 while (_q < aligned_end) {
5293 unsigned long data = * (unsigned long *) _q;
5294 if (data & FAST_CHAR_MASK)
5295 break;
5296 _p[0] = ((unsigned short *) _q)[0];
5297 _p[1] = ((unsigned short *) _q)[1];
5298#if (SIZEOF_LONG == 8)
5299 _p[2] = ((unsigned short *) _q)[2];
5300 _p[3] = ((unsigned short *) _q)[3];
5301#endif
5302 _q += SIZEOF_LONG;
5303 _p += SIZEOF_LONG / 2;
5304 }
5305 }
5306 else {
5307 /* Byteswapped ordering is similar, but we must decompose
5308 the copy bytewise, and take care of zero'ing out the
5309 upper bytes if the target object is in 32-bit units
5310 (that is, in UCS-4 builds). */
5311 while (_q < aligned_end) {
5312 unsigned long data = * (unsigned long *) _q;
5313 if (data & SWAPPED_FAST_CHAR_MASK)
5314 break;
5315 /* Zero upper bytes in UCS-4 builds */
5316#if (Py_UNICODE_SIZE > 2)
5317 _p[0] = 0;
5318 _p[1] = 0;
5319#if (SIZEOF_LONG == 8)
5320 _p[2] = 0;
5321 _p[3] = 0;
5322#endif
5323#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005324 /* Issue #4916; UCS-4 builds on big endian machines must
5325 fill the two last bytes of each 4-byte unit. */
5326#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
5327# define OFF 2
5328#else
5329# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00005330#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005331 ((unsigned char *) _p)[OFF + 1] = _q[0];
5332 ((unsigned char *) _p)[OFF + 0] = _q[1];
5333 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
5334 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
5335#if (SIZEOF_LONG == 8)
5336 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
5337 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
5338 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
5339 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
5340#endif
5341#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00005342 _q += SIZEOF_LONG;
5343 _p += SIZEOF_LONG / 2;
5344 }
5345 }
5346 p = _p;
5347 q = _q;
5348 if (q >= e)
5349 break;
5350 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005351 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005352
Benjamin Peterson14339b62009-01-31 16:36:08 +00005353 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005354
5355 if (ch < 0xD800 || ch > 0xDFFF) {
5356 *p++ = ch;
5357 continue;
5358 }
5359
5360 /* UTF-16 code pair: */
5361 if (q > e) {
5362 errmsg = "unexpected end of data";
5363 startinpos = (((const char *)q) - 2) - starts;
5364 endinpos = ((const char *)e) + 1 - starts;
5365 goto utf16Error;
5366 }
5367 if (0xD800 <= ch && ch <= 0xDBFF) {
5368 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
5369 q += 2;
5370 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00005371#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005372 *p++ = ch;
5373 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005374#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005375 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005376#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005377 continue;
5378 }
5379 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005380 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005381 startinpos = (((const char *)q)-4)-starts;
5382 endinpos = startinpos+2;
5383 goto utf16Error;
5384 }
5385
Benjamin Peterson14339b62009-01-31 16:36:08 +00005386 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005387 errmsg = "illegal encoding";
5388 startinpos = (((const char *)q)-2)-starts;
5389 endinpos = startinpos+2;
5390 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005391
Benjamin Peterson29060642009-01-31 22:14:21 +00005392 utf16Error:
5393 outpos = p - PyUnicode_AS_UNICODE(unicode);
5394 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005395 errors,
5396 &errorHandler,
5397 "utf16", errmsg,
5398 &starts,
5399 (const char **)&e,
5400 &startinpos,
5401 &endinpos,
5402 &exc,
5403 (const char **)&q,
5404 &unicode,
5405 &outpos,
5406 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00005407 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005408 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005409 /* remaining byte at the end? (size should be even) */
5410 if (e == q) {
5411 if (!consumed) {
5412 errmsg = "truncated data";
5413 startinpos = ((const char *)q) - starts;
5414 endinpos = ((const char *)e) + 1 - starts;
5415 outpos = p - PyUnicode_AS_UNICODE(unicode);
5416 if (unicode_decode_call_errorhandler(
5417 errors,
5418 &errorHandler,
5419 "utf16", errmsg,
5420 &starts,
5421 (const char **)&e,
5422 &startinpos,
5423 &endinpos,
5424 &exc,
5425 (const char **)&q,
5426 &unicode,
5427 &outpos,
5428 &p))
5429 goto onError;
5430 /* The remaining input chars are ignored if the callback
5431 chooses to skip the input */
5432 }
5433 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005434
5435 if (byteorder)
5436 *byteorder = bo;
5437
Walter Dörwald69652032004-09-07 20:24:22 +00005438 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005439 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005440
Guido van Rossumd57fd912000-03-10 22:53:23 +00005441 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02005442 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005443 goto onError;
5444
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005445 Py_XDECREF(errorHandler);
5446 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005447#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005448 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005449 Py_DECREF(unicode);
5450 return NULL;
5451 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005452#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005453 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005454 return (PyObject *)unicode;
5455
Benjamin Peterson29060642009-01-31 22:14:21 +00005456 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005457 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005458 Py_XDECREF(errorHandler);
5459 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005460 return NULL;
5461}
5462
Antoine Pitrouab868312009-01-10 15:40:25 +00005463#undef FAST_CHAR_MASK
5464#undef SWAPPED_FAST_CHAR_MASK
5465
Tim Peters772747b2001-08-09 22:21:55 +00005466PyObject *
5467PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005468 Py_ssize_t size,
5469 const char *errors,
5470 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005471{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005472 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005473 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005474 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005475#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005476 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005477#else
5478 const int pairs = 0;
5479#endif
Tim Peters772747b2001-08-09 22:21:55 +00005480 /* Offsets from p for storing byte pairs in the right order. */
5481#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5482 int ihi = 1, ilo = 0;
5483#else
5484 int ihi = 0, ilo = 1;
5485#endif
5486
Benjamin Peterson29060642009-01-31 22:14:21 +00005487#define STORECHAR(CH) \
5488 do { \
5489 p[ihi] = ((CH) >> 8) & 0xff; \
5490 p[ilo] = (CH) & 0xff; \
5491 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005492 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005493
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005494#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005495 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005496 if (s[i] >= 0x10000)
5497 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005498#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005499 /* 2 * (size + pairs + (byteorder == 0)) */
5500 if (size > PY_SSIZE_T_MAX ||
5501 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005502 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005503 nsize = size + pairs + (byteorder == 0);
5504 bytesize = nsize * 2;
5505 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005506 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005507 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005508 if (v == NULL)
5509 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005510
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005511 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005512 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005513 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005514 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005515 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005516
5517 if (byteorder == -1) {
5518 /* force LE */
5519 ihi = 1;
5520 ilo = 0;
5521 }
5522 else if (byteorder == 1) {
5523 /* force BE */
5524 ihi = 0;
5525 ilo = 1;
5526 }
5527
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005528 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005529 Py_UNICODE ch = *s++;
5530 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005531#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005532 if (ch >= 0x10000) {
5533 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5534 ch = 0xD800 | ((ch-0x10000) >> 10);
5535 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005536#endif
Tim Peters772747b2001-08-09 22:21:55 +00005537 STORECHAR(ch);
5538 if (ch2)
5539 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005540 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005541
5542 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005543 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005544#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005545}
5546
Alexander Belopolsky40018472011-02-26 01:02:56 +00005547PyObject *
5548PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005549{
5550 if (!PyUnicode_Check(unicode)) {
5551 PyErr_BadArgument();
5552 return NULL;
5553 }
5554 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005555 PyUnicode_GET_SIZE(unicode),
5556 NULL,
5557 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005558}
5559
5560/* --- Unicode Escape Codec ----------------------------------------------- */
5561
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005562/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5563 if all the escapes in the string make it still a valid ASCII string.
5564 Returns -1 if any escapes were found which cause the string to
5565 pop out of ASCII range. Otherwise returns the length of the
5566 required buffer to hold the string.
5567 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005568static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005569length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5570{
5571 const unsigned char *p = (const unsigned char *)s;
5572 const unsigned char *end = p + size;
5573 Py_ssize_t length = 0;
5574
5575 if (size < 0)
5576 return -1;
5577
5578 for (; p < end; ++p) {
5579 if (*p > 127) {
5580 /* Non-ASCII */
5581 return -1;
5582 }
5583 else if (*p != '\\') {
5584 /* Normal character */
5585 ++length;
5586 }
5587 else {
5588 /* Backslash-escape, check next char */
5589 ++p;
5590 /* Escape sequence reaches till end of string or
5591 non-ASCII follow-up. */
5592 if (p >= end || *p > 127)
5593 return -1;
5594 switch (*p) {
5595 case '\n':
5596 /* backslash + \n result in zero characters */
5597 break;
5598 case '\\': case '\'': case '\"':
5599 case 'b': case 'f': case 't':
5600 case 'n': case 'r': case 'v': case 'a':
5601 ++length;
5602 break;
5603 case '0': case '1': case '2': case '3':
5604 case '4': case '5': case '6': case '7':
5605 case 'x': case 'u': case 'U': case 'N':
5606 /* these do not guarantee ASCII characters */
5607 return -1;
5608 default:
5609 /* count the backslash + the other character */
5610 length += 2;
5611 }
5612 }
5613 }
5614 return length;
5615}
5616
/* Store helpers for the unicode-escape decoder.

   WRITE_ASCII_OR_WSTR stores `value` at position `index` of `buf`.  When
   `kind` is PyUnicode_WCHAR_KIND the buffer is addressed as a Py_UNICODE
   array; for every other kind it is addressed as an array of bytes.

   WRITE_WSTR always addresses `buf` as a Py_UNICODE array.  It does not
   take the kind as an argument: it asserts on a variable named `kind`
   that must be in scope where the macro is expanded.  It expands to a
   single expression. */
#define WRITE_ASCII_OR_WSTR(kind, buf, index, value)                    \
    do {                                                                \
        if ((kind) == PyUnicode_WCHAR_KIND)                             \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);       \
        else                                                            \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
    } while (0)

#define WRITE_WSTR(buf, index, value)                                   \
    (assert(kind == PyUnicode_WCHAR_KIND),                              \
     ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value))
5630
5631
Fredrik Lundh06d12682001-01-24 07:59:11 +00005632static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005633
Alexander Belopolsky40018472011-02-26 01:02:56 +00005634PyObject *
5635PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005636 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005637 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005638{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005639 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005640 Py_ssize_t startinpos;
5641 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005642 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005643 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005644 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005645 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005646 char* message;
5647 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005648 PyObject *errorHandler = NULL;
5649 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005650 Py_ssize_t ascii_length;
5651 Py_ssize_t i;
5652 int kind;
5653 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005654
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005655 ascii_length = length_of_escaped_ascii_string(s, size);
5656
5657 /* After length_of_escaped_ascii_string() there are two alternatives,
5658 either the string is pure ASCII with named escapes like \n, etc.
5659 and we determined it's exact size (common case)
5660 or it contains \x, \u, ... escape sequences. then we create a
5661 legacy wchar string and resize it at the end of this function. */
5662 if (ascii_length >= 0) {
5663 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
5664 if (!v)
5665 goto onError;
5666 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5667 kind = PyUnicode_1BYTE_KIND;
5668 data = PyUnicode_DATA(v);
5669 }
5670 else {
5671 /* Escaped strings will always be longer than the resulting
5672 Unicode string, so we start with size here and then reduce the
5673 length after conversion to the true value.
5674 (but if the error callback returns a long replacement string
5675 we'll have to allocate more space) */
5676 v = _PyUnicode_New(size);
5677 if (!v)
5678 goto onError;
5679 kind = PyUnicode_WCHAR_KIND;
5680 data = PyUnicode_AS_UNICODE(v);
5681 }
5682
Guido van Rossumd57fd912000-03-10 22:53:23 +00005683 if (size == 0)
5684 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005685 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005686 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005687
Guido van Rossumd57fd912000-03-10 22:53:23 +00005688 while (s < end) {
5689 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005690 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005691 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005692
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005693 if (kind == PyUnicode_WCHAR_KIND) {
5694 assert(i < _PyUnicode_WSTR_LENGTH(v));
5695 }
5696 else {
5697 /* The only case in which i == ascii_length is a backslash
5698 followed by a newline. */
5699 assert(i <= ascii_length);
5700 }
5701
Guido van Rossumd57fd912000-03-10 22:53:23 +00005702 /* Non-escape characters are interpreted as Unicode ordinals */
5703 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005704 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005705 continue;
5706 }
5707
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005708 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005709 /* \ - Escapes */
5710 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005711 c = *s++;
5712 if (s > end)
5713 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005714
5715 if (kind == PyUnicode_WCHAR_KIND) {
5716 assert(i < _PyUnicode_WSTR_LENGTH(v));
5717 }
5718 else {
5719 /* The only case in which i == ascii_length is a backslash
5720 followed by a newline. */
5721 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5722 }
5723
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005724 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005725
Benjamin Peterson29060642009-01-31 22:14:21 +00005726 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005727 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005728 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5729 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5730 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5731 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5732 /* FF */
5733 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5734 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5735 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5736 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5737 /* VT */
5738 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5739 /* BEL, not classic C */
5740 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005741
Benjamin Peterson29060642009-01-31 22:14:21 +00005742 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005743 case '0': case '1': case '2': case '3':
5744 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005745 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005746 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005747 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005748 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005749 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005750 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005751 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005752 break;
5753
Benjamin Peterson29060642009-01-31 22:14:21 +00005754 /* hex escapes */
5755 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005756 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005757 digits = 2;
5758 message = "truncated \\xXX escape";
5759 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005760
Benjamin Peterson29060642009-01-31 22:14:21 +00005761 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005762 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005763 digits = 4;
5764 message = "truncated \\uXXXX escape";
5765 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005766
Benjamin Peterson29060642009-01-31 22:14:21 +00005767 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005768 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005769 digits = 8;
5770 message = "truncated \\UXXXXXXXX escape";
5771 hexescape:
5772 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005773 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005774 if (s+digits>end) {
5775 endinpos = size;
5776 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005777 errors, &errorHandler,
5778 "unicodeescape", "end of string in escape sequence",
5779 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005780 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005781 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005782 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005783 goto nextByte;
5784 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005785 for (j = 0; j < digits; ++j) {
5786 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005787 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005788 endinpos = (s+j+1)-starts;
5789 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005790 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005791 errors, &errorHandler,
5792 "unicodeescape", message,
5793 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005794 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005795 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005796 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005797 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005798 }
5799 chr = (chr<<4) & ~0xF;
5800 if (c >= '0' && c <= '9')
5801 chr += c - '0';
5802 else if (c >= 'a' && c <= 'f')
5803 chr += 10 + c - 'a';
5804 else
5805 chr += 10 + c - 'A';
5806 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005807 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005808 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005809 /* _decoding_error will have already written into the
5810 target buffer. */
5811 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005812 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005813 /* when we get here, chr is a 32-bit unicode character */
5814 if (chr <= 0xffff)
5815 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005816 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005817 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005818 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005819 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005820#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005821 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005822#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005823 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005824 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5825 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005826#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005827 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005828 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005829 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005830 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005831 errors, &errorHandler,
5832 "unicodeescape", "illegal Unicode character",
5833 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005834 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005835 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005836 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005837 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005838 break;
5839
Benjamin Peterson29060642009-01-31 22:14:21 +00005840 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005841 case 'N':
5842 message = "malformed \\N character escape";
5843 if (ucnhash_CAPI == NULL) {
5844 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005845 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5846 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005847 if (ucnhash_CAPI == NULL)
5848 goto ucnhashError;
5849 }
5850 if (*s == '{') {
5851 const char *start = s+1;
5852 /* look for the closing brace */
5853 while (*s != '}' && s < end)
5854 s++;
5855 if (s > start && s < end && *s == '}') {
5856 /* found a name. look it up in the unicode database */
5857 message = "unknown Unicode character name";
5858 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005859 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005860 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005861 goto store;
5862 }
5863 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005864 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005865 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005866 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005867 errors, &errorHandler,
5868 "unicodeescape", message,
5869 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005870 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005871 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005872 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005873 break;
5874
5875 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005876 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005877 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005878 message = "\\ at end of string";
5879 s--;
5880 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005881 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005882 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005883 errors, &errorHandler,
5884 "unicodeescape", message,
5885 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005886 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005887 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005888 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005889 }
5890 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005891 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5892 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005893 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005894 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005895 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005896 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005897 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005898 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005899 /* Ensure the length prediction worked in case of ASCII strings */
5900 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5901
Victor Stinnerfe226c02011-10-03 03:52:20 +02005902 if (kind == PyUnicode_WCHAR_KIND)
5903 {
5904 if (PyUnicode_Resize((PyObject**)&v, i) < 0)
5905 goto onError;
Victor Stinnerfe226c02011-10-03 03:52:20 +02005906 }
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005907 Py_XDECREF(errorHandler);
5908 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005909#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005910 if (_PyUnicode_READY_REPLACE(&v)) {
5911 Py_DECREF(v);
5912 return NULL;
5913 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005914#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005915 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005917
Benjamin Peterson29060642009-01-31 22:14:21 +00005918 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005919 PyErr_SetString(
5920 PyExc_UnicodeError,
5921 "\\N escapes not supported (can't load unicodedata module)"
5922 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005923 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005924 Py_XDECREF(errorHandler);
5925 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005926 return NULL;
5927
Benjamin Peterson29060642009-01-31 22:14:21 +00005928 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005929 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005930 Py_XDECREF(errorHandler);
5931 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005932 return NULL;
5933}
5934
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005935#undef WRITE_ASCII_OR_WSTR
5936#undef WRITE_WSTR
5937
Guido van Rossumd57fd912000-03-10 22:53:23 +00005938/* Return a Unicode-Escape string version of the Unicode object.
5939
5940 If quotes is true, the string is enclosed in u"" or u'' quotes as
5941 appropriate.
5942
5943*/
5944
Alexander Belopolsky40018472011-02-26 01:02:56 +00005945PyObject *
5946PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005947 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005948{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005949 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005950 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005951
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005952#ifdef Py_UNICODE_WIDE
5953 const Py_ssize_t expandsize = 10;
5954#else
5955 const Py_ssize_t expandsize = 6;
5956#endif
5957
Thomas Wouters89f507f2006-12-13 04:49:30 +00005958 /* XXX(nnorwitz): rather than over-allocating, it would be
5959 better to choose a different scheme. Perhaps scan the
5960 first N-chars of the string and allocate based on that size.
5961 */
5962 /* Initial allocation is based on the longest-possible unichr
5963 escape.
5964
5965 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5966 unichr, so in this case it's the longest unichr escape. In
5967 narrow (UTF-16) builds this is five chars per source unichr
5968 since there are two unichrs in the surrogate pair, so in narrow
5969 (UTF-16) builds it's not the longest unichr escape.
5970
5971 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5972 so in the narrow (UTF-16) build case it's the longest unichr
5973 escape.
5974 */
5975
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005976 if (size == 0)
5977 return PyBytes_FromStringAndSize(NULL, 0);
5978
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005979 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005980 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005981
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005982 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005983 2
5984 + expandsize*size
5985 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005986 if (repr == NULL)
5987 return NULL;
5988
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005989 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005990
Guido van Rossumd57fd912000-03-10 22:53:23 +00005991 while (size-- > 0) {
5992 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005993
Walter Dörwald79e913e2007-05-12 11:08:06 +00005994 /* Escape backslashes */
5995 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005996 *p++ = '\\';
5997 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005998 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005999 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006000
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00006001#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006002 /* Map 21-bit characters to '\U00xxxxxx' */
6003 else if (ch >= 0x10000) {
6004 *p++ = '\\';
6005 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006006 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
6007 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
6008 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6009 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6010 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6011 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6012 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6013 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00006014 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006015 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00006016#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006017 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
6018 else if (ch >= 0xD800 && ch < 0xDC00) {
6019 Py_UNICODE ch2;
6020 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00006021
Benjamin Peterson29060642009-01-31 22:14:21 +00006022 ch2 = *s++;
6023 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00006024 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006025 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
6026 *p++ = '\\';
6027 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006028 *p++ = Py_hexdigits[(ucs >> 28) & 0x0000000F];
6029 *p++ = Py_hexdigits[(ucs >> 24) & 0x0000000F];
6030 *p++ = Py_hexdigits[(ucs >> 20) & 0x0000000F];
6031 *p++ = Py_hexdigits[(ucs >> 16) & 0x0000000F];
6032 *p++ = Py_hexdigits[(ucs >> 12) & 0x0000000F];
6033 *p++ = Py_hexdigits[(ucs >> 8) & 0x0000000F];
6034 *p++ = Py_hexdigits[(ucs >> 4) & 0x0000000F];
6035 *p++ = Py_hexdigits[ucs & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00006036 continue;
6037 }
6038 /* Fall through: isolated surrogates are copied as-is */
6039 s--;
6040 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006041 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00006042#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006043
Guido van Rossumd57fd912000-03-10 22:53:23 +00006044 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006045 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006046 *p++ = '\\';
6047 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006048 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6049 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6050 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6051 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006052 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006053
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006054 /* Map special whitespace to '\t', \n', '\r' */
6055 else if (ch == '\t') {
6056 *p++ = '\\';
6057 *p++ = 't';
6058 }
6059 else if (ch == '\n') {
6060 *p++ = '\\';
6061 *p++ = 'n';
6062 }
6063 else if (ch == '\r') {
6064 *p++ = '\\';
6065 *p++ = 'r';
6066 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006067
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006068 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006069 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006070 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006071 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006072 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6073 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006074 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006075
Guido van Rossumd57fd912000-03-10 22:53:23 +00006076 /* Copy everything else as-is */
6077 else
6078 *p++ = (char) ch;
6079 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006080
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006081 assert(p - PyBytes_AS_STRING(repr) > 0);
6082 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6083 return NULL;
6084 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006085}
6086
Alexander Belopolsky40018472011-02-26 01:02:56 +00006087PyObject *
6088PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006089{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006090 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006091 if (!PyUnicode_Check(unicode)) {
6092 PyErr_BadArgument();
6093 return NULL;
6094 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00006095 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
6096 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006097 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006098}
6099
6100/* --- Raw Unicode Escape Codec ------------------------------------------- */
6101
Alexander Belopolsky40018472011-02-26 01:02:56 +00006102PyObject *
6103PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006104 Py_ssize_t size,
6105 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006106{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006107 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006108 Py_ssize_t startinpos;
6109 Py_ssize_t endinpos;
6110 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006111 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006112 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006113 const char *end;
6114 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006115 PyObject *errorHandler = NULL;
6116 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006117
Guido van Rossumd57fd912000-03-10 22:53:23 +00006118 /* Escaped strings will always be longer than the resulting
6119 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006120 length after conversion to the true value. (But decoding error
6121 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006122 v = _PyUnicode_New(size);
6123 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006124 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006125 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006126 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006127 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006128 end = s + size;
6129 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006130 unsigned char c;
6131 Py_UCS4 x;
6132 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006133 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006134
Benjamin Peterson29060642009-01-31 22:14:21 +00006135 /* Non-escape characters are interpreted as Unicode ordinals */
6136 if (*s != '\\') {
6137 *p++ = (unsigned char)*s++;
6138 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006139 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006140 startinpos = s-starts;
6141
6142 /* \u-escapes are only interpreted iff the number of leading
6143 backslashes if odd */
6144 bs = s;
6145 for (;s < end;) {
6146 if (*s != '\\')
6147 break;
6148 *p++ = (unsigned char)*s++;
6149 }
6150 if (((s - bs) & 1) == 0 ||
6151 s >= end ||
6152 (*s != 'u' && *s != 'U')) {
6153 continue;
6154 }
6155 p--;
6156 count = *s=='u' ? 4 : 8;
6157 s++;
6158
6159 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
6160 outpos = p-PyUnicode_AS_UNICODE(v);
6161 for (x = 0, i = 0; i < count; ++i, ++s) {
6162 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006163 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006164 endinpos = s-starts;
6165 if (unicode_decode_call_errorhandler(
6166 errors, &errorHandler,
6167 "rawunicodeescape", "truncated \\uXXXX",
6168 &starts, &end, &startinpos, &endinpos, &exc, &s,
6169 &v, &outpos, &p))
6170 goto onError;
6171 goto nextByte;
6172 }
6173 x = (x<<4) & ~0xF;
6174 if (c >= '0' && c <= '9')
6175 x += c - '0';
6176 else if (c >= 'a' && c <= 'f')
6177 x += 10 + c - 'a';
6178 else
6179 x += 10 + c - 'A';
6180 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00006181 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00006182 /* UCS-2 character */
6183 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006184 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006185 /* UCS-4 character. Either store directly, or as
6186 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00006187#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006188 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006189#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006190 x -= 0x10000L;
6191 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
6192 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00006193#endif
6194 } else {
6195 endinpos = s-starts;
6196 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006197 if (unicode_decode_call_errorhandler(
6198 errors, &errorHandler,
6199 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006200 &starts, &end, &startinpos, &endinpos, &exc, &s,
6201 &v, &outpos, &p))
6202 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006203 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006204 nextByte:
6205 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006206 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02006207 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006208 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006209 Py_XDECREF(errorHandler);
6210 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006211#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006212 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006213 Py_DECREF(v);
6214 return NULL;
6215 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006216#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006217 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006218 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006219
Benjamin Peterson29060642009-01-31 22:14:21 +00006220 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006221 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006222 Py_XDECREF(errorHandler);
6223 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006224 return NULL;
6225}
6226
Alexander Belopolsky40018472011-02-26 01:02:56 +00006227PyObject *
6228PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006229 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006230{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006231 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006232 char *p;
6233 char *q;
6234
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006235#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006236 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006237#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006238 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006239#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00006240
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006241 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006242 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006243
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006244 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006245 if (repr == NULL)
6246 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00006247 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006248 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006249
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006250 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006251 while (size-- > 0) {
6252 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006253#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006254 /* Map 32-bit characters to '\Uxxxxxxxx' */
6255 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006256 *p++ = '\\';
6257 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006258 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6259 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6260 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6261 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6262 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6263 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6264 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6265 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006266 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006267 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00006268#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006269 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
6270 if (ch >= 0xD800 && ch < 0xDC00) {
6271 Py_UNICODE ch2;
6272 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006273
Benjamin Peterson29060642009-01-31 22:14:21 +00006274 ch2 = *s++;
6275 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00006276 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006277 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
6278 *p++ = '\\';
6279 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006280 *p++ = Py_hexdigits[(ucs >> 28) & 0xf];
6281 *p++ = Py_hexdigits[(ucs >> 24) & 0xf];
6282 *p++ = Py_hexdigits[(ucs >> 20) & 0xf];
6283 *p++ = Py_hexdigits[(ucs >> 16) & 0xf];
6284 *p++ = Py_hexdigits[(ucs >> 12) & 0xf];
6285 *p++ = Py_hexdigits[(ucs >> 8) & 0xf];
6286 *p++ = Py_hexdigits[(ucs >> 4) & 0xf];
6287 *p++ = Py_hexdigits[ucs & 0xf];
Benjamin Peterson29060642009-01-31 22:14:21 +00006288 continue;
6289 }
6290 /* Fall through: isolated surrogates are copied as-is */
6291 s--;
6292 size++;
6293 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006294#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006295 /* Map 16-bit characters to '\uxxxx' */
6296 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006297 *p++ = '\\';
6298 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006299 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6300 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6301 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6302 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006303 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006304 /* Copy everything else as-is */
6305 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006306 *p++ = (char) ch;
6307 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006308 size = p - q;
6309
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006310 assert(size > 0);
6311 if (_PyBytes_Resize(&repr, size) < 0)
6312 return NULL;
6313 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006314}
6315
Alexander Belopolsky40018472011-02-26 01:02:56 +00006316PyObject *
6317PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006318{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006319 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006320 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00006321 PyErr_BadArgument();
6322 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006323 }
Walter Dörwald711005d2007-05-12 12:03:26 +00006324 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
6325 PyUnicode_GET_SIZE(unicode));
6326
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006327 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006328}
6329
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006330/* --- Unicode Internal Codec ------------------------------------------- */
6331
Alexander Belopolsky40018472011-02-26 01:02:56 +00006332PyObject *
6333_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006334 Py_ssize_t size,
6335 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006336{
6337 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006338 Py_ssize_t startinpos;
6339 Py_ssize_t endinpos;
6340 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006341 PyUnicodeObject *v;
6342 Py_UNICODE *p;
6343 const char *end;
6344 const char *reason;
6345 PyObject *errorHandler = NULL;
6346 PyObject *exc = NULL;
6347
Neal Norwitzd43069c2006-01-08 01:12:10 +00006348#ifdef Py_UNICODE_WIDE
6349 Py_UNICODE unimax = PyUnicode_GetMax();
6350#endif
6351
Thomas Wouters89f507f2006-12-13 04:49:30 +00006352 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006353 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
6354 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006355 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006356 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
6357 as string was created with the old API. */
6358 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006359 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006360 p = PyUnicode_AS_UNICODE(v);
6361 end = s + size;
6362
6363 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006364 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006365 /* We have to sanity check the raw data, otherwise doom looms for
6366 some malformed UCS-4 data. */
6367 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006368#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006369 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006370#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006371 end-s < Py_UNICODE_SIZE
6372 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006373 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006374 startinpos = s - starts;
6375 if (end-s < Py_UNICODE_SIZE) {
6376 endinpos = end-starts;
6377 reason = "truncated input";
6378 }
6379 else {
6380 endinpos = s - starts + Py_UNICODE_SIZE;
6381 reason = "illegal code point (> 0x10FFFF)";
6382 }
6383 outpos = p - PyUnicode_AS_UNICODE(v);
6384 if (unicode_decode_call_errorhandler(
6385 errors, &errorHandler,
6386 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006387 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00006388 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006389 goto onError;
6390 }
6391 }
6392 else {
6393 p++;
6394 s += Py_UNICODE_SIZE;
6395 }
6396 }
6397
Victor Stinnerfe226c02011-10-03 03:52:20 +02006398 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006399 goto onError;
6400 Py_XDECREF(errorHandler);
6401 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006402#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006403 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006404 Py_DECREF(v);
6405 return NULL;
6406 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006407#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006408 assert(_PyUnicode_CheckConsistency(v, 1));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006409 return (PyObject *)v;
6410
Benjamin Peterson29060642009-01-31 22:14:21 +00006411 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006412 Py_XDECREF(v);
6413 Py_XDECREF(errorHandler);
6414 Py_XDECREF(exc);
6415 return NULL;
6416}
6417
Guido van Rossumd57fd912000-03-10 22:53:23 +00006418/* --- Latin-1 Codec ------------------------------------------------------ */
6419
Alexander Belopolsky40018472011-02-26 01:02:56 +00006420PyObject *
6421PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006422 Py_ssize_t size,
6423 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006424{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006425 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006426 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006427}
6428
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006429/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006430static void
6431make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006432 const char *encoding,
6433 const Py_UNICODE *unicode, Py_ssize_t size,
6434 Py_ssize_t startpos, Py_ssize_t endpos,
6435 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006436{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006437 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006438 *exceptionObject = PyUnicodeEncodeError_Create(
6439 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006440 }
6441 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006442 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6443 goto onError;
6444 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6445 goto onError;
6446 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6447 goto onError;
6448 return;
6449 onError:
6450 Py_DECREF(*exceptionObject);
6451 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006452 }
6453}
6454
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006455/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006456static void
6457raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006458 const char *encoding,
6459 const Py_UNICODE *unicode, Py_ssize_t size,
6460 Py_ssize_t startpos, Py_ssize_t endpos,
6461 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006462{
6463 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006464 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006465 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006466 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006467}
6468
6469/* error handling callback helper:
6470 build arguments, call the callback and check the arguments,
6471 put the result into newpos and return the replacement string, which
6472 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006473static PyObject *
6474unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006475 PyObject **errorHandler,
6476 const char *encoding, const char *reason,
6477 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
6478 Py_ssize_t startpos, Py_ssize_t endpos,
6479 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006480{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006481 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006482
6483 PyObject *restuple;
6484 PyObject *resunicode;
6485
6486 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006487 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006488 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006489 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006490 }
6491
6492 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006493 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006494 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006495 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006496
6497 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006498 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006499 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006500 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006501 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006502 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006503 Py_DECREF(restuple);
6504 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006505 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006506 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006507 &resunicode, newpos)) {
6508 Py_DECREF(restuple);
6509 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006510 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006511 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6512 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6513 Py_DECREF(restuple);
6514 return NULL;
6515 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006516 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006517 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006518 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006519 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6520 Py_DECREF(restuple);
6521 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006522 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006523 Py_INCREF(resunicode);
6524 Py_DECREF(restuple);
6525 return resunicode;
6526}
6527
Alexander Belopolsky40018472011-02-26 01:02:56 +00006528static PyObject *
6529unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006530 Py_ssize_t size,
6531 const char *errors,
6532 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006533{
6534 /* output object */
6535 PyObject *res;
6536 /* pointers to the beginning and end+1 of input */
6537 const Py_UNICODE *startp = p;
6538 const Py_UNICODE *endp = p + size;
6539 /* pointer to the beginning of the unencodable characters */
6540 /* const Py_UNICODE *badp = NULL; */
6541 /* pointer into the output */
6542 char *str;
6543 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006544 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006545 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6546 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006547 PyObject *errorHandler = NULL;
6548 PyObject *exc = NULL;
6549 /* the following variable is used for caching string comparisons
6550 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6551 int known_errorHandler = -1;
6552
6553 /* allocate enough for a simple encoding without
6554 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006555 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006556 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006557 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006558 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006559 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006560 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006561 ressize = size;
6562
6563 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006564 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006565
Benjamin Peterson29060642009-01-31 22:14:21 +00006566 /* can we encode this? */
6567 if (c<limit) {
6568 /* no overflow check, because we know that the space is enough */
6569 *str++ = (char)c;
6570 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006571 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006572 else {
6573 Py_ssize_t unicodepos = p-startp;
6574 Py_ssize_t requiredsize;
6575 PyObject *repunicode;
6576 Py_ssize_t repsize;
6577 Py_ssize_t newpos;
6578 Py_ssize_t respos;
6579 Py_UNICODE *uni2;
6580 /* startpos for collecting unencodable chars */
6581 const Py_UNICODE *collstart = p;
6582 const Py_UNICODE *collend = p;
6583 /* find all unecodable characters */
6584 while ((collend < endp) && ((*collend)>=limit))
6585 ++collend;
6586 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6587 if (known_errorHandler==-1) {
6588 if ((errors==NULL) || (!strcmp(errors, "strict")))
6589 known_errorHandler = 1;
6590 else if (!strcmp(errors, "replace"))
6591 known_errorHandler = 2;
6592 else if (!strcmp(errors, "ignore"))
6593 known_errorHandler = 3;
6594 else if (!strcmp(errors, "xmlcharrefreplace"))
6595 known_errorHandler = 4;
6596 else
6597 known_errorHandler = 0;
6598 }
6599 switch (known_errorHandler) {
6600 case 1: /* strict */
6601 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
6602 goto onError;
6603 case 2: /* replace */
6604 while (collstart++<collend)
6605 *str++ = '?'; /* fall through */
6606 case 3: /* ignore */
6607 p = collend;
6608 break;
6609 case 4: /* xmlcharrefreplace */
6610 respos = str - PyBytes_AS_STRING(res);
6611 /* determine replacement size (temporarily (mis)uses p) */
6612 for (p = collstart, repsize = 0; p < collend; ++p) {
6613 if (*p<10)
6614 repsize += 2+1+1;
6615 else if (*p<100)
6616 repsize += 2+2+1;
6617 else if (*p<1000)
6618 repsize += 2+3+1;
6619 else if (*p<10000)
6620 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006621#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006622 else
6623 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006624#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006625 else if (*p<100000)
6626 repsize += 2+5+1;
6627 else if (*p<1000000)
6628 repsize += 2+6+1;
6629 else
6630 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00006631#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006632 }
6633 requiredsize = respos+repsize+(endp-collend);
6634 if (requiredsize > ressize) {
6635 if (requiredsize<2*ressize)
6636 requiredsize = 2*ressize;
6637 if (_PyBytes_Resize(&res, requiredsize))
6638 goto onError;
6639 str = PyBytes_AS_STRING(res) + respos;
6640 ressize = requiredsize;
6641 }
6642 /* generate replacement (temporarily (mis)uses p) */
6643 for (p = collstart; p < collend; ++p) {
6644 str += sprintf(str, "&#%d;", (int)*p);
6645 }
6646 p = collend;
6647 break;
6648 default:
6649 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6650 encoding, reason, startp, size, &exc,
6651 collstart-startp, collend-startp, &newpos);
6652 if (repunicode == NULL)
6653 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006654 if (PyBytes_Check(repunicode)) {
6655 /* Directly copy bytes result to output. */
6656 repsize = PyBytes_Size(repunicode);
6657 if (repsize > 1) {
6658 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006659 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006660 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6661 Py_DECREF(repunicode);
6662 goto onError;
6663 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006664 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006665 ressize += repsize-1;
6666 }
6667 memcpy(str, PyBytes_AsString(repunicode), repsize);
6668 str += repsize;
6669 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006670 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006671 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006672 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006673 /* need more space? (at least enough for what we
6674 have+the replacement+the rest of the string, so
6675 we won't have to check space for encodable characters) */
6676 respos = str - PyBytes_AS_STRING(res);
6677 repsize = PyUnicode_GET_SIZE(repunicode);
6678 requiredsize = respos+repsize+(endp-collend);
6679 if (requiredsize > ressize) {
6680 if (requiredsize<2*ressize)
6681 requiredsize = 2*ressize;
6682 if (_PyBytes_Resize(&res, requiredsize)) {
6683 Py_DECREF(repunicode);
6684 goto onError;
6685 }
6686 str = PyBytes_AS_STRING(res) + respos;
6687 ressize = requiredsize;
6688 }
6689 /* check if there is anything unencodable in the replacement
6690 and copy it to the output */
6691 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
6692 c = *uni2;
6693 if (c >= limit) {
6694 raise_encode_exception(&exc, encoding, startp, size,
6695 unicodepos, unicodepos+1, reason);
6696 Py_DECREF(repunicode);
6697 goto onError;
6698 }
6699 *str = (char)c;
6700 }
6701 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006702 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006703 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006704 }
6705 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006706 /* Resize if we allocated to much */
6707 size = str - PyBytes_AS_STRING(res);
6708 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006709 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006710 if (_PyBytes_Resize(&res, size) < 0)
6711 goto onError;
6712 }
6713
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006714 Py_XDECREF(errorHandler);
6715 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006716 return res;
6717
6718 onError:
6719 Py_XDECREF(res);
6720 Py_XDECREF(errorHandler);
6721 Py_XDECREF(exc);
6722 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006723}
6724
Alexander Belopolsky40018472011-02-26 01:02:56 +00006725PyObject *
6726PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006727 Py_ssize_t size,
6728 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006729{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006730 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006731}
6732
Alexander Belopolsky40018472011-02-26 01:02:56 +00006733PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006734_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006735{
6736 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006737 PyErr_BadArgument();
6738 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006739 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006740 if (PyUnicode_READY(unicode) == -1)
6741 return NULL;
6742 /* Fast path: if it is a one-byte string, construct
6743 bytes object directly. */
6744 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6745 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6746 PyUnicode_GET_LENGTH(unicode));
6747 /* Non-Latin-1 characters present. Defer to above function to
6748 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006749 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006750 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006751 errors);
6752}
6753
6754PyObject*
6755PyUnicode_AsLatin1String(PyObject *unicode)
6756{
6757 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006758}
6759
6760/* --- 7-bit ASCII Codec -------------------------------------------------- */
6761
Alexander Belopolsky40018472011-02-26 01:02:56 +00006762PyObject *
6763PyUnicode_DecodeASCII(const char *s,
6764 Py_ssize_t size,
6765 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006766{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006767 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006768 PyUnicodeObject *v;
Victor Stinner702c7342011-10-05 13:50:52 +02006769 Py_UNICODE *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006770 Py_ssize_t startinpos;
6771 Py_ssize_t endinpos;
6772 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006773 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006774 int has_error;
6775 const unsigned char *p = (const unsigned char *)s;
6776 const unsigned char *end = p + size;
6777 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006778 PyObject *errorHandler = NULL;
6779 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006780
Guido van Rossumd57fd912000-03-10 22:53:23 +00006781 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006782 if (size == 1 && (unsigned char)s[0] < 128)
6783 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006784
Victor Stinner702c7342011-10-05 13:50:52 +02006785 has_error = 0;
6786 while (p < end && !has_error) {
6787 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6788 an explanation. */
6789 if (!((size_t) p & LONG_PTR_MASK)) {
6790 /* Help register allocation */
6791 register const unsigned char *_p = p;
6792 while (_p < aligned_end) {
6793 unsigned long value = *(unsigned long *) _p;
6794 if (value & ASCII_CHAR_MASK) {
6795 has_error = 1;
6796 break;
6797 }
6798 _p += SIZEOF_LONG;
6799 }
6800 if (_p == end)
6801 break;
6802 if (has_error)
6803 break;
6804 p = _p;
6805 }
6806 if (*p & 0x80) {
6807 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006808 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006809 }
6810 else {
6811 ++p;
6812 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006813 }
Victor Stinner702c7342011-10-05 13:50:52 +02006814 if (!has_error)
6815 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006816
Guido van Rossumd57fd912000-03-10 22:53:23 +00006817 v = _PyUnicode_New(size);
6818 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006819 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006820 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006821 return (PyObject *)v;
Victor Stinner702c7342011-10-05 13:50:52 +02006822 u = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006823 e = s + size;
6824 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006825 register unsigned char c = (unsigned char)*s;
6826 if (c < 128) {
Victor Stinner702c7342011-10-05 13:50:52 +02006827 *u++ = c;
Benjamin Peterson29060642009-01-31 22:14:21 +00006828 ++s;
6829 }
6830 else {
6831 startinpos = s-starts;
6832 endinpos = startinpos + 1;
Victor Stinner702c7342011-10-05 13:50:52 +02006833 outpos = u - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006834 if (unicode_decode_call_errorhandler(
6835 errors, &errorHandler,
6836 "ascii", "ordinal not in range(128)",
6837 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinner702c7342011-10-05 13:50:52 +02006838 &v, &outpos, &u))
Benjamin Peterson29060642009-01-31 22:14:21 +00006839 goto onError;
6840 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006841 }
Victor Stinner702c7342011-10-05 13:50:52 +02006842 if (u - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
6843 if (PyUnicode_Resize((PyObject**)&v, u - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006844 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006845 Py_XDECREF(errorHandler);
6846 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006847#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006848 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006849 Py_DECREF(v);
6850 return NULL;
6851 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006852#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006853 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006854 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006855
Benjamin Peterson29060642009-01-31 22:14:21 +00006856 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006857 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006858 Py_XDECREF(errorHandler);
6859 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006860 return NULL;
6861}
6862
Alexander Belopolsky40018472011-02-26 01:02:56 +00006863PyObject *
6864PyUnicode_EncodeASCII(const Py_UNICODE *p,
6865 Py_ssize_t size,
6866 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006867{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006868 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006869}
6870
Alexander Belopolsky40018472011-02-26 01:02:56 +00006871PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006872_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006873{
6874 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006875 PyErr_BadArgument();
6876 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006877 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006878 if (PyUnicode_READY(unicode) == -1)
6879 return NULL;
6880 /* Fast path: if it is an ASCII-only string, construct bytes object
6881 directly. Else defer to above function to raise the exception. */
6882 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6883 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6884 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006885 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006886 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006887 errors);
6888}
6889
6890PyObject *
6891PyUnicode_AsASCIIString(PyObject *unicode)
6892{
6893 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006894}
6895
Victor Stinner99b95382011-07-04 14:23:54 +02006896#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006897
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006898/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006899
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006900#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006901#define NEED_RETRY
6902#endif
6903
Victor Stinner3a50e702011-10-18 21:21:00 +02006904#ifndef WC_ERR_INVALID_CHARS
6905# define WC_ERR_INVALID_CHARS 0x0080
6906#endif
6907
6908static char*
6909code_page_name(UINT code_page, PyObject **obj)
6910{
6911 *obj = NULL;
6912 if (code_page == CP_ACP)
6913 return "mbcs";
6914 if (code_page == CP_UTF7)
6915 return "CP_UTF7";
6916 if (code_page == CP_UTF8)
6917 return "CP_UTF8";
6918
6919 *obj = PyBytes_FromFormat("cp%u", code_page);
6920 if (*obj == NULL)
6921 return NULL;
6922 return PyBytes_AS_STRING(*obj);
6923}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006924
Alexander Belopolsky40018472011-02-26 01:02:56 +00006925static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006926is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006927{
6928 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006929 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006930
Victor Stinner3a50e702011-10-18 21:21:00 +02006931 if (!IsDBCSLeadByteEx(code_page, *curr))
6932 return 0;
6933
6934 prev = CharPrevExA(code_page, s, curr, 0);
6935 if (prev == curr)
6936 return 1;
6937 /* FIXME: This code is limited to "true" double-byte encodings,
6938 as it assumes an incomplete character consists of a single
6939 byte. */
6940 if (curr - prev == 2)
6941 return 1;
6942 if (!IsDBCSLeadByteEx(code_page, *prev))
6943 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006944 return 0;
6945}
6946
Victor Stinner3a50e702011-10-18 21:21:00 +02006947static DWORD
6948decode_code_page_flags(UINT code_page)
6949{
6950 if (code_page == CP_UTF7) {
6951 /* The CP_UTF7 decoder only supports flags=0 */
6952 return 0;
6953 }
6954 else
6955 return MB_ERR_INVALID_CHARS;
6956}
6957
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006958/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006959 * Decode a byte string from a Windows code page into unicode object in strict
6960 * mode.
6961 *
6962 * Returns consumed size if succeed, returns -2 on decode error, or raise a
6963 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006964 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006965static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006966decode_code_page_strict(UINT code_page,
6967 PyUnicodeObject **v,
6968 const char *in,
6969 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006970{
Victor Stinner3a50e702011-10-18 21:21:00 +02006971 const DWORD flags = decode_code_page_flags(code_page);
6972 Py_UNICODE *out;
6973 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006974
6975 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006976 assert(insize > 0);
6977 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6978 if (outsize <= 0)
6979 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006980
6981 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006982 /* Create unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006983 *v = _PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006984 if (*v == NULL)
6985 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006986 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006987 }
6988 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006989 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006990 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6991 if (PyUnicode_Resize((PyObject**)v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006992 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006993 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006994 }
6995
6996 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006997 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6998 if (outsize <= 0)
6999 goto error;
7000 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007001
Victor Stinner3a50e702011-10-18 21:21:00 +02007002error:
7003 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7004 return -2;
7005 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007006 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007007}
7008
Victor Stinner3a50e702011-10-18 21:21:00 +02007009/*
7010 * Decode a byte string from a code page into unicode object with an error
7011 * handler.
7012 *
7013 * Returns consumed size if succeed, or raise a WindowsError or
7014 * UnicodeDecodeError exception and returns -1 on error.
7015 */
7016static int
7017decode_code_page_errors(UINT code_page,
7018 PyUnicodeObject **v,
7019 const char *in,
7020 int size,
7021 const char *errors)
7022{
7023 const char *startin = in;
7024 const char *endin = in + size;
7025 const DWORD flags = decode_code_page_flags(code_page);
7026 /* Ideally, we should get reason from FormatMessage. This is the Windows
7027 2000 English version of the message. */
7028 const char *reason = "No mapping for the Unicode character exists "
7029 "in the target code page.";
7030 /* each step cannot decode more than 1 character, but a character can be
7031 represented as a surrogate pair */
7032 wchar_t buffer[2], *startout, *out;
7033 int insize, outsize;
7034 PyObject *errorHandler = NULL;
7035 PyObject *exc = NULL;
7036 PyObject *encoding_obj = NULL;
7037 char *encoding;
7038 DWORD err;
7039 int ret = -1;
7040
7041 assert(size > 0);
7042
7043 encoding = code_page_name(code_page, &encoding_obj);
7044 if (encoding == NULL)
7045 return -1;
7046
7047 if (errors == NULL || strcmp(errors, "strict") == 0) {
7048 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7049 UnicodeDecodeError. */
7050 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7051 if (exc != NULL) {
7052 PyCodec_StrictErrors(exc);
7053 Py_CLEAR(exc);
7054 }
7055 goto error;
7056 }
7057
7058 if (*v == NULL) {
7059 /* Create unicode object */
7060 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7061 PyErr_NoMemory();
7062 goto error;
7063 }
7064 *v = _PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
7065 if (*v == NULL)
7066 goto error;
7067 startout = PyUnicode_AS_UNICODE(*v);
7068 }
7069 else {
7070 /* Extend unicode object */
7071 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7072 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7073 PyErr_NoMemory();
7074 goto error;
7075 }
7076 if (PyUnicode_Resize((PyObject**)v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
7077 goto error;
7078 startout = PyUnicode_AS_UNICODE(*v) + n;
7079 }
7080
7081 /* Decode the byte string character per character */
7082 out = startout;
7083 while (in < endin)
7084 {
7085 /* Decode a character */
7086 insize = 1;
7087 do
7088 {
7089 outsize = MultiByteToWideChar(code_page, flags,
7090 in, insize,
7091 buffer, Py_ARRAY_LENGTH(buffer));
7092 if (outsize > 0)
7093 break;
7094 err = GetLastError();
7095 if (err != ERROR_NO_UNICODE_TRANSLATION
7096 && err != ERROR_INSUFFICIENT_BUFFER)
7097 {
7098 PyErr_SetFromWindowsErr(0);
7099 goto error;
7100 }
7101 insize++;
7102 }
7103 /* 4=maximum length of a UTF-8 sequence */
7104 while (insize <= 4 && (in + insize) <= endin);
7105
7106 if (outsize <= 0) {
7107 Py_ssize_t startinpos, endinpos, outpos;
7108
7109 startinpos = in - startin;
7110 endinpos = startinpos + 1;
7111 outpos = out - PyUnicode_AS_UNICODE(*v);
7112 if (unicode_decode_call_errorhandler(
7113 errors, &errorHandler,
7114 encoding, reason,
7115 &startin, &endin, &startinpos, &endinpos, &exc, &in,
7116 v, &outpos, &out))
7117 {
7118 goto error;
7119 }
7120 }
7121 else {
7122 in += insize;
7123 memcpy(out, buffer, outsize * sizeof(wchar_t));
7124 out += outsize;
7125 }
7126 }
7127
7128 /* write a NUL character at the end */
7129 *out = 0;
7130
7131 /* Extend unicode object */
7132 outsize = out - startout;
7133 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
7134 if (PyUnicode_Resize((PyObject**)v, outsize) < 0)
7135 goto error;
7136 ret = 0;
7137
7138error:
7139 Py_XDECREF(encoding_obj);
7140 Py_XDECREF(errorHandler);
7141 Py_XDECREF(exc);
7142 return ret;
7143}
7144
7145/*
7146 * Decode a byte string from a Windows code page into unicode object. If
7147 * 'final' is set, converts trailing lead-byte too.
7148 *
7149 * Returns consumed size if succeed, or raise a WindowsError or
7150 * UnicodeDecodeError exception and returns -1 on error.
7151 */
7152static int
7153decode_code_page(UINT code_page,
7154 PyUnicodeObject **v,
7155 const char *s, int size,
7156 int final, const char *errors)
7157{
7158 int done;
7159
7160 /* Skip trailing lead-byte unless 'final' is set */
7161 if (size == 0) {
7162 if (*v == NULL) {
7163 Py_INCREF(unicode_empty);
7164 *v = (PyUnicodeObject*)unicode_empty;
7165 if (*v == NULL)
7166 return -1;
7167 }
7168 return 0;
7169 }
7170
7171 if (!final && is_dbcs_lead_byte(code_page, s, size - 1))
7172 --size;
7173
7174 done = decode_code_page_strict(code_page, v, s, size);
7175 if (done == -2)
7176 done = decode_code_page_errors(code_page, v, s, size, errors);
7177 return done;
7178}
7179
7180static PyObject *
7181decode_code_page_stateful(int code_page,
7182 const char *s,
7183 Py_ssize_t size,
7184 const char *errors,
7185 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007186{
7187 PyUnicodeObject *v = NULL;
7188 int done;
7189
Victor Stinner3a50e702011-10-18 21:21:00 +02007190 if (code_page < 0) {
7191 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7192 return NULL;
7193 }
7194
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007195 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007196 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007197
7198#ifdef NEED_RETRY
7199 retry:
7200 if (size > INT_MAX)
Victor Stinner3a50e702011-10-18 21:21:00 +02007201 done = decode_code_page(code_page, &v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007202 else
7203#endif
Victor Stinner3a50e702011-10-18 21:21:00 +02007204 done = decode_code_page(code_page, &v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007205
7206 if (done < 0) {
7207 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00007208 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007209 }
7210
7211 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007212 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007213
7214#ifdef NEED_RETRY
7215 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007216 s += done;
7217 size -= done;
7218 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007219 }
7220#endif
Victor Stinner3a50e702011-10-18 21:21:00 +02007221
Victor Stinner17efeed2011-10-04 20:05:46 +02007222#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007223 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007224 Py_DECREF(v);
7225 return NULL;
7226 }
Victor Stinner17efeed2011-10-04 20:05:46 +02007227#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02007228 assert(_PyUnicode_CheckConsistency(v, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007229 return (PyObject *)v;
7230}
7231
Alexander Belopolsky40018472011-02-26 01:02:56 +00007232PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007233PyUnicode_DecodeCodePageStateful(int code_page,
7234 const char *s,
7235 Py_ssize_t size,
7236 const char *errors,
7237 Py_ssize_t *consumed)
7238{
7239 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7240}
7241
7242PyObject *
7243PyUnicode_DecodeMBCSStateful(const char *s,
7244 Py_ssize_t size,
7245 const char *errors,
7246 Py_ssize_t *consumed)
7247{
7248 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7249}
7250
7251PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007252PyUnicode_DecodeMBCS(const char *s,
7253 Py_ssize_t size,
7254 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007255{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007256 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7257}
7258
Victor Stinner3a50e702011-10-18 21:21:00 +02007259static DWORD
7260encode_code_page_flags(UINT code_page, const char *errors)
7261{
7262 if (code_page == CP_UTF8) {
7263 if (winver.dwMajorVersion >= 6)
7264 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7265 and later */
7266 return WC_ERR_INVALID_CHARS;
7267 else
7268 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7269 return 0;
7270 }
7271 else if (code_page == CP_UTF7) {
7272 /* CP_UTF7 only supports flags=0 */
7273 return 0;
7274 }
7275 else {
7276 if (errors != NULL && strcmp(errors, "replace") == 0)
7277 return 0;
7278 else
7279 return WC_NO_BEST_FIT_CHARS;
7280 }
7281}
7282
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007283/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007284 * Encode a Unicode string to a Windows code page into a byte string in strict
7285 * mode.
7286 *
7287 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7288 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007289 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007290static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007291encode_code_page_strict(UINT code_page, PyObject **outbytes,
7292 const Py_UNICODE *p, const int size,
7293 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007294{
Victor Stinner554f3f02010-06-16 23:33:54 +00007295 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007296 BOOL *pusedDefaultChar = &usedDefaultChar;
7297 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007298 PyObject *exc = NULL;
Victor Stinner3a50e702011-10-18 21:21:00 +02007299 const DWORD flags = encode_code_page_flags(code_page, NULL);
7300 char *out;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007301
Victor Stinner3a50e702011-10-18 21:21:00 +02007302 assert(size > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007303
Victor Stinner3a50e702011-10-18 21:21:00 +02007304 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007305 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007306 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007307 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007308
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007309 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007310 outsize = WideCharToMultiByte(code_page, flags,
7311 p, size,
7312 NULL, 0,
7313 NULL, pusedDefaultChar);
7314 if (outsize <= 0)
7315 goto error;
7316 /* If we used a default char, then we failed! */
7317 if (pusedDefaultChar && *pusedDefaultChar)
7318 return -2;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007319
Victor Stinner3a50e702011-10-18 21:21:00 +02007320 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007321 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007322 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7323 if (*outbytes == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007324 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007325 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007326 }
7327 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007328 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007329 const Py_ssize_t n = PyBytes_Size(*outbytes);
7330 if (outsize > PY_SSIZE_T_MAX - n) {
7331 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00007332 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007333 }
7334 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7335 return -1;
7336 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007337 }
7338
7339 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007340 outsize = WideCharToMultiByte(code_page, flags,
7341 p, size,
7342 out, outsize,
7343 NULL, pusedDefaultChar);
7344 if (outsize <= 0)
7345 goto error;
7346 if (pusedDefaultChar && *pusedDefaultChar)
7347 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007348 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007349
Victor Stinner3a50e702011-10-18 21:21:00 +02007350error:
7351 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7352 return -2;
7353 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007354 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007355}
7356
Victor Stinner3a50e702011-10-18 21:21:00 +02007357/*
7358 * Encode a Unicode string to a Windows code page into a byte string using a
7359 * error handler.
7360 *
7361 * Returns consumed characters if succeed, or raise a WindowsError and returns
7362 * -1 on other error.
7363 */
7364static int
7365encode_code_page_errors(UINT code_page, PyObject **outbytes,
7366 const Py_UNICODE *in, const int insize,
7367 const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007368{
Victor Stinner3a50e702011-10-18 21:21:00 +02007369 const DWORD flags = encode_code_page_flags(code_page, errors);
7370 const Py_UNICODE *startin = in;
7371 const Py_UNICODE *endin = in + insize;
7372 /* Ideally, we should get reason from FormatMessage. This is the Windows
7373 2000 English version of the message. */
7374 const char *reason = "invalid character";
7375 /* 4=maximum length of a UTF-8 sequence */
7376 char buffer[4];
7377 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7378 Py_ssize_t outsize;
7379 char *out;
7380 int charsize;
7381 PyObject *errorHandler = NULL;
7382 PyObject *exc = NULL;
7383 PyObject *encoding_obj = NULL;
7384 char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007385 Py_ssize_t startpos, newpos, newoutsize;
7386 PyObject *rep;
7387 int ret = -1;
7388
7389 assert(insize > 0);
7390
7391 encoding = code_page_name(code_page, &encoding_obj);
7392 if (encoding == NULL)
7393 return -1;
7394
7395 if (errors == NULL || strcmp(errors, "strict") == 0) {
7396 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7397 then we raise a UnicodeEncodeError. */
7398 make_encode_exception(&exc, encoding, in, insize, 0, 0, reason);
7399 if (exc != NULL) {
7400 PyCodec_StrictErrors(exc);
7401 Py_DECREF(exc);
7402 }
7403 Py_XDECREF(encoding_obj);
7404 return -1;
7405 }
7406
7407 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7408 pusedDefaultChar = &usedDefaultChar;
7409 else
7410 pusedDefaultChar = NULL;
7411
7412 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7413 PyErr_NoMemory();
7414 goto error;
7415 }
7416 outsize = insize * Py_ARRAY_LENGTH(buffer);
7417
7418 if (*outbytes == NULL) {
7419 /* Create string object */
7420 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7421 if (*outbytes == NULL)
7422 goto error;
7423 out = PyBytes_AS_STRING(*outbytes);
7424 }
7425 else {
7426 /* Extend string object */
7427 Py_ssize_t n = PyBytes_Size(*outbytes);
7428 if (n > PY_SSIZE_T_MAX - outsize) {
7429 PyErr_NoMemory();
7430 goto error;
7431 }
7432 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7433 goto error;
7434 out = PyBytes_AS_STRING(*outbytes) + n;
7435 }
7436
7437 /* Encode the string character per character */
7438 while (in < endin)
7439 {
7440 if ((in + 2) <= endin
7441 && 0xD800 <= in[0] && in[0] <= 0xDBFF
7442 && 0xDC00 <= in[1] && in[1] <= 0xDFFF)
7443 charsize = 2;
7444 else
7445 charsize = 1;
7446
7447 outsize = WideCharToMultiByte(code_page, flags,
7448 in, charsize,
7449 buffer, Py_ARRAY_LENGTH(buffer),
7450 NULL, pusedDefaultChar);
7451 if (outsize > 0) {
7452 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7453 {
7454 in += charsize;
7455 memcpy(out, buffer, outsize);
7456 out += outsize;
7457 continue;
7458 }
7459 }
7460 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7461 PyErr_SetFromWindowsErr(0);
7462 goto error;
7463 }
7464
7465 charsize = Py_MAX(charsize - 1, 1);
7466 startpos = in - startin;
7467 rep = unicode_encode_call_errorhandler(
7468 errors, &errorHandler, encoding, reason,
7469 startin, insize, &exc,
7470 startpos, startpos + charsize, &newpos);
7471 if (rep == NULL)
7472 goto error;
7473 in = startin + newpos;
7474
7475 if (PyBytes_Check(rep)) {
7476 outsize = PyBytes_GET_SIZE(rep);
7477 if (outsize != 1) {
7478 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7479 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7480 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7481 Py_DECREF(rep);
7482 goto error;
7483 }
7484 out = PyBytes_AS_STRING(*outbytes) + offset;
7485 }
7486 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7487 out += outsize;
7488 }
7489 else {
7490 Py_ssize_t i;
7491 enum PyUnicode_Kind kind;
7492 void *data;
7493
7494 if (PyUnicode_READY(rep) < 0) {
7495 Py_DECREF(rep);
7496 goto error;
7497 }
7498
7499 outsize = PyUnicode_GET_LENGTH(rep);
7500 if (outsize != 1) {
7501 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7502 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7503 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7504 Py_DECREF(rep);
7505 goto error;
7506 }
7507 out = PyBytes_AS_STRING(*outbytes) + offset;
7508 }
7509 kind = PyUnicode_KIND(rep);
7510 data = PyUnicode_DATA(rep);
7511 for (i=0; i < outsize; i++) {
7512 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7513 if (ch > 127) {
7514 raise_encode_exception(&exc,
7515 encoding,
7516 startin, insize,
7517 startpos, startpos + charsize,
7518 "unable to encode error handler result to ASCII");
7519 Py_DECREF(rep);
7520 goto error;
7521 }
7522 *out = (unsigned char)ch;
7523 out++;
7524 }
7525 }
7526 Py_DECREF(rep);
7527 }
7528 /* write a NUL byte */
7529 *out = 0;
7530 outsize = out - PyBytes_AS_STRING(*outbytes);
7531 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7532 if (_PyBytes_Resize(outbytes, outsize) < 0)
7533 goto error;
7534 ret = 0;
7535
7536error:
7537 Py_XDECREF(encoding_obj);
7538 Py_XDECREF(errorHandler);
7539 Py_XDECREF(exc);
7540 return ret;
7541}
7542
7543/*
7544 * Encode a Unicode string to a Windows code page into a byte string.
7545 *
7546 * Returns consumed characters if succeed, or raise a WindowsError and returns
7547 * -1 on other error.
7548 */
7549static int
7550encode_code_page_chunk(UINT code_page, PyObject **outbytes,
7551 const Py_UNICODE *p, int size,
7552 const char* errors)
7553{
7554 int done;
7555
7556 if (size == 0) {
7557 if (*outbytes == NULL) {
7558 *outbytes = PyBytes_FromStringAndSize(NULL, 0);
7559 if (*outbytes == NULL)
7560 return -1;
7561 }
7562 return 0;
7563 }
7564
7565 done = encode_code_page_strict(code_page, outbytes, p, size, errors);
7566 if (done == -2)
7567 done = encode_code_page_errors(code_page, outbytes, p, size, errors);
7568 return done;
7569}
7570
7571static PyObject *
7572encode_code_page(int code_page,
7573 const Py_UNICODE *p, Py_ssize_t size,
7574 const char *errors)
7575{
7576 PyObject *outbytes = NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007577 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00007578
Victor Stinner3a50e702011-10-18 21:21:00 +02007579 if (code_page < 0) {
7580 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7581 return NULL;
7582 }
7583
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007584#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00007585 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007586 if (size > INT_MAX)
Victor Stinner3a50e702011-10-18 21:21:00 +02007587 ret = encode_code_page_chunk(code_page, &outbytes, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007588 else
7589#endif
Victor Stinner3a50e702011-10-18 21:21:00 +02007590 ret = encode_code_page_chunk(code_page, &outbytes, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007591
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007592 if (ret < 0) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007593 Py_XDECREF(outbytes);
Benjamin Peterson29060642009-01-31 22:14:21 +00007594 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007595 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007596
7597#ifdef NEED_RETRY
7598 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007599 p += INT_MAX;
7600 size -= INT_MAX;
7601 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007602 }
7603#endif
7604
Victor Stinner3a50e702011-10-18 21:21:00 +02007605 return outbytes;
7606}
7607
7608PyObject *
7609PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7610 Py_ssize_t size,
7611 const char *errors)
7612{
7613 return encode_code_page(CP_ACP, p, size, errors);
7614}
7615
7616PyObject *
7617PyUnicode_EncodeCodePage(int code_page,
7618 PyObject *unicode,
7619 const char *errors)
7620{
7621 const Py_UNICODE *p;
7622 Py_ssize_t size;
7623 p = PyUnicode_AsUnicodeAndSize(unicode, &size);
7624 if (p == NULL)
7625 return NULL;
7626 return encode_code_page(code_page, p, size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007627}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007628
Alexander Belopolsky40018472011-02-26 01:02:56 +00007629PyObject *
7630PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007631{
7632 if (!PyUnicode_Check(unicode)) {
7633 PyErr_BadArgument();
7634 return NULL;
7635 }
7636 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007637 PyUnicode_GET_SIZE(unicode),
7638 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007639}
7640
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007641#undef NEED_RETRY
7642
Victor Stinner99b95382011-07-04 14:23:54 +02007643#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007644
Guido van Rossumd57fd912000-03-10 22:53:23 +00007645/* --- Character Mapping Codec -------------------------------------------- */
7646
Alexander Belopolsky40018472011-02-26 01:02:56 +00007647PyObject *
7648PyUnicode_DecodeCharmap(const char *s,
7649 Py_ssize_t size,
7650 PyObject *mapping,
7651 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007652{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007653 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007654 Py_ssize_t startinpos;
7655 Py_ssize_t endinpos;
7656 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007657 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007658 PyUnicodeObject *v;
7659 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007660 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007661 PyObject *errorHandler = NULL;
7662 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007663 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007664 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00007665
Guido van Rossumd57fd912000-03-10 22:53:23 +00007666 /* Default to Latin-1 */
7667 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007668 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007669
7670 v = _PyUnicode_New(size);
7671 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007672 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007673 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007674 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007675 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007676 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007677 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007678 mapstring = PyUnicode_AS_UNICODE(mapping);
7679 maplen = PyUnicode_GET_SIZE(mapping);
7680 while (s < e) {
7681 unsigned char ch = *s;
7682 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007683
Benjamin Peterson29060642009-01-31 22:14:21 +00007684 if (ch < maplen)
7685 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00007686
Benjamin Peterson29060642009-01-31 22:14:21 +00007687 if (x == 0xfffe) {
7688 /* undefined mapping */
7689 outpos = p-PyUnicode_AS_UNICODE(v);
7690 startinpos = s-starts;
7691 endinpos = startinpos+1;
7692 if (unicode_decode_call_errorhandler(
7693 errors, &errorHandler,
7694 "charmap", "character maps to <undefined>",
7695 &starts, &e, &startinpos, &endinpos, &exc, &s,
7696 &v, &outpos, &p)) {
7697 goto onError;
7698 }
7699 continue;
7700 }
7701 *p++ = x;
7702 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007703 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007704 }
7705 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007706 while (s < e) {
7707 unsigned char ch = *s;
7708 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007709
Benjamin Peterson29060642009-01-31 22:14:21 +00007710 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7711 w = PyLong_FromLong((long)ch);
7712 if (w == NULL)
7713 goto onError;
7714 x = PyObject_GetItem(mapping, w);
7715 Py_DECREF(w);
7716 if (x == NULL) {
7717 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7718 /* No mapping found means: mapping is undefined. */
7719 PyErr_Clear();
7720 x = Py_None;
7721 Py_INCREF(x);
7722 } else
7723 goto onError;
7724 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007725
Benjamin Peterson29060642009-01-31 22:14:21 +00007726 /* Apply mapping */
7727 if (PyLong_Check(x)) {
7728 long value = PyLong_AS_LONG(x);
7729 if (value < 0 || value > 65535) {
7730 PyErr_SetString(PyExc_TypeError,
7731 "character mapping must be in range(65536)");
7732 Py_DECREF(x);
7733 goto onError;
7734 }
7735 *p++ = (Py_UNICODE)value;
7736 }
7737 else if (x == Py_None) {
7738 /* undefined mapping */
7739 outpos = p-PyUnicode_AS_UNICODE(v);
7740 startinpos = s-starts;
7741 endinpos = startinpos+1;
7742 if (unicode_decode_call_errorhandler(
7743 errors, &errorHandler,
7744 "charmap", "character maps to <undefined>",
7745 &starts, &e, &startinpos, &endinpos, &exc, &s,
7746 &v, &outpos, &p)) {
7747 Py_DECREF(x);
7748 goto onError;
7749 }
7750 Py_DECREF(x);
7751 continue;
7752 }
7753 else if (PyUnicode_Check(x)) {
7754 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007755
Benjamin Peterson29060642009-01-31 22:14:21 +00007756 if (targetsize == 1)
7757 /* 1-1 mapping */
7758 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007759
Benjamin Peterson29060642009-01-31 22:14:21 +00007760 else if (targetsize > 1) {
7761 /* 1-n mapping */
7762 if (targetsize > extrachars) {
7763 /* resize first */
7764 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
7765 Py_ssize_t needed = (targetsize - extrachars) + \
7766 (targetsize << 2);
7767 extrachars += needed;
7768 /* XXX overflow detection missing */
Victor Stinnerfe226c02011-10-03 03:52:20 +02007769 if (PyUnicode_Resize((PyObject**)&v,
Benjamin Peterson29060642009-01-31 22:14:21 +00007770 PyUnicode_GET_SIZE(v) + needed) < 0) {
7771 Py_DECREF(x);
7772 goto onError;
7773 }
7774 p = PyUnicode_AS_UNICODE(v) + oldpos;
7775 }
7776 Py_UNICODE_COPY(p,
7777 PyUnicode_AS_UNICODE(x),
7778 targetsize);
7779 p += targetsize;
7780 extrachars -= targetsize;
7781 }
7782 /* 1-0 mapping: skip the character */
7783 }
7784 else {
7785 /* wrong return value */
7786 PyErr_SetString(PyExc_TypeError,
7787 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007788 Py_DECREF(x);
7789 goto onError;
7790 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007791 Py_DECREF(x);
7792 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007793 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007794 }
7795 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02007796 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007797 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007798 Py_XDECREF(errorHandler);
7799 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02007800#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007801 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007802 Py_DECREF(v);
7803 return NULL;
7804 }
Victor Stinner17efeed2011-10-04 20:05:46 +02007805#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02007806 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007807 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00007808
Benjamin Peterson29060642009-01-31 22:14:21 +00007809 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007810 Py_XDECREF(errorHandler);
7811 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007812 Py_XDECREF(v);
7813 return NULL;
7814}
7815
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007816/* Charmap encoding: the lookup table */
7817
Alexander Belopolsky40018472011-02-26 01:02:56 +00007818struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007819 PyObject_HEAD
7820 unsigned char level1[32];
7821 int count2, count3;
7822 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007823};
7824
7825static PyObject*
7826encoding_map_size(PyObject *obj, PyObject* args)
7827{
7828 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007829 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007830 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007831}
7832
7833static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007834 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007835 PyDoc_STR("Return the size (in bytes) of this object") },
7836 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007837};
7838
7839static void
7840encoding_map_dealloc(PyObject* o)
7841{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007842 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007843}
7844
7845static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007846 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007847 "EncodingMap", /*tp_name*/
7848 sizeof(struct encoding_map), /*tp_basicsize*/
7849 0, /*tp_itemsize*/
7850 /* methods */
7851 encoding_map_dealloc, /*tp_dealloc*/
7852 0, /*tp_print*/
7853 0, /*tp_getattr*/
7854 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007855 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007856 0, /*tp_repr*/
7857 0, /*tp_as_number*/
7858 0, /*tp_as_sequence*/
7859 0, /*tp_as_mapping*/
7860 0, /*tp_hash*/
7861 0, /*tp_call*/
7862 0, /*tp_str*/
7863 0, /*tp_getattro*/
7864 0, /*tp_setattro*/
7865 0, /*tp_as_buffer*/
7866 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7867 0, /*tp_doc*/
7868 0, /*tp_traverse*/
7869 0, /*tp_clear*/
7870 0, /*tp_richcompare*/
7871 0, /*tp_weaklistoffset*/
7872 0, /*tp_iter*/
7873 0, /*tp_iternext*/
7874 encoding_map_methods, /*tp_methods*/
7875 0, /*tp_members*/
7876 0, /*tp_getset*/
7877 0, /*tp_base*/
7878 0, /*tp_dict*/
7879 0, /*tp_descr_get*/
7880 0, /*tp_descr_set*/
7881 0, /*tp_dictoffset*/
7882 0, /*tp_init*/
7883 0, /*tp_alloc*/
7884 0, /*tp_new*/
7885 0, /*tp_free*/
7886 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007887};
7888
7889PyObject*
7890PyUnicode_BuildEncodingMap(PyObject* string)
7891{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007892 PyObject *result;
7893 struct encoding_map *mresult;
7894 int i;
7895 int need_dict = 0;
7896 unsigned char level1[32];
7897 unsigned char level2[512];
7898 unsigned char *mlevel1, *mlevel2, *mlevel3;
7899 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007900 int kind;
7901 void *data;
7902 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007903
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007904 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007905 PyErr_BadArgument();
7906 return NULL;
7907 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007908 kind = PyUnicode_KIND(string);
7909 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007910 memset(level1, 0xFF, sizeof level1);
7911 memset(level2, 0xFF, sizeof level2);
7912
7913 /* If there isn't a one-to-one mapping of NULL to \0,
7914 or if there are non-BMP characters, we need to use
7915 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007916 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007917 need_dict = 1;
7918 for (i = 1; i < 256; i++) {
7919 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007920 ch = PyUnicode_READ(kind, data, i);
7921 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007922 need_dict = 1;
7923 break;
7924 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007925 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007926 /* unmapped character */
7927 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007928 l1 = ch >> 11;
7929 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007930 if (level1[l1] == 0xFF)
7931 level1[l1] = count2++;
7932 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007933 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007934 }
7935
7936 if (count2 >= 0xFF || count3 >= 0xFF)
7937 need_dict = 1;
7938
7939 if (need_dict) {
7940 PyObject *result = PyDict_New();
7941 PyObject *key, *value;
7942 if (!result)
7943 return NULL;
7944 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007945 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007946 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007947 if (!key || !value)
7948 goto failed1;
7949 if (PyDict_SetItem(result, key, value) == -1)
7950 goto failed1;
7951 Py_DECREF(key);
7952 Py_DECREF(value);
7953 }
7954 return result;
7955 failed1:
7956 Py_XDECREF(key);
7957 Py_XDECREF(value);
7958 Py_DECREF(result);
7959 return NULL;
7960 }
7961
7962 /* Create a three-level trie */
7963 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7964 16*count2 + 128*count3 - 1);
7965 if (!result)
7966 return PyErr_NoMemory();
7967 PyObject_Init(result, &EncodingMapType);
7968 mresult = (struct encoding_map*)result;
7969 mresult->count2 = count2;
7970 mresult->count3 = count3;
7971 mlevel1 = mresult->level1;
7972 mlevel2 = mresult->level23;
7973 mlevel3 = mresult->level23 + 16*count2;
7974 memcpy(mlevel1, level1, 32);
7975 memset(mlevel2, 0xFF, 16*count2);
7976 memset(mlevel3, 0, 128*count3);
7977 count3 = 0;
7978 for (i = 1; i < 256; i++) {
7979 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007980 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007981 /* unmapped character */
7982 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007983 o1 = PyUnicode_READ(kind, data, i)>>11;
7984 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007985 i2 = 16*mlevel1[o1] + o2;
7986 if (mlevel2[i2] == 0xFF)
7987 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007988 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007989 i3 = 128*mlevel2[i2] + o3;
7990 mlevel3[i3] = i;
7991 }
7992 return result;
7993}
7994
7995static int
7996encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
7997{
7998 struct encoding_map *map = (struct encoding_map*)mapping;
7999 int l1 = c>>11;
8000 int l2 = (c>>7) & 0xF;
8001 int l3 = c & 0x7F;
8002 int i;
8003
8004#ifdef Py_UNICODE_WIDE
8005 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008006 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008007 }
8008#endif
8009 if (c == 0)
8010 return 0;
8011 /* level 1*/
8012 i = map->level1[l1];
8013 if (i == 0xFF) {
8014 return -1;
8015 }
8016 /* level 2*/
8017 i = map->level23[16*i+l2];
8018 if (i == 0xFF) {
8019 return -1;
8020 }
8021 /* level 3 */
8022 i = map->level23[16*map->count2 + 128*i + l3];
8023 if (i == 0) {
8024 return -1;
8025 }
8026 return i;
8027}
8028
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008029/* Lookup the character ch in the mapping. If the character
8030 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008031 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008032static PyObject *
8033charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008034{
Christian Heimes217cfd12007-12-02 14:31:20 +00008035 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008036 PyObject *x;
8037
8038 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008039 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008040 x = PyObject_GetItem(mapping, w);
8041 Py_DECREF(w);
8042 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008043 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8044 /* No mapping found means: mapping is undefined. */
8045 PyErr_Clear();
8046 x = Py_None;
8047 Py_INCREF(x);
8048 return x;
8049 } else
8050 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008051 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008052 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008053 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008054 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008055 long value = PyLong_AS_LONG(x);
8056 if (value < 0 || value > 255) {
8057 PyErr_SetString(PyExc_TypeError,
8058 "character mapping must be in range(256)");
8059 Py_DECREF(x);
8060 return NULL;
8061 }
8062 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008063 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008064 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008065 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008066 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008067 /* wrong return value */
8068 PyErr_Format(PyExc_TypeError,
8069 "character mapping must return integer, bytes or None, not %.400s",
8070 x->ob_type->tp_name);
8071 Py_DECREF(x);
8072 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008073 }
8074}
8075
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008076static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008077charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008078{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008079 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8080 /* exponentially overallocate to minimize reallocations */
8081 if (requiredsize < 2*outsize)
8082 requiredsize = 2*outsize;
8083 if (_PyBytes_Resize(outobj, requiredsize))
8084 return -1;
8085 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008086}
8087
/* Outcome of encoding one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the character has no mapping */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008091/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008092 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008093 space is available. Return a new reference to the object that
8094 was put in the output buffer, or Py_None, if the mapping was undefined
8095 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008096 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008097static charmapencode_result
8098charmapencode_output(Py_UNICODE c, PyObject *mapping,
8099 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008100{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008101 PyObject *rep;
8102 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008103 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008104
Christian Heimes90aa7642007-12-19 02:45:37 +00008105 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008106 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008107 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008108 if (res == -1)
8109 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008110 if (outsize<requiredsize)
8111 if (charmapencode_resize(outobj, outpos, requiredsize))
8112 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008113 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008114 outstart[(*outpos)++] = (char)res;
8115 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008116 }
8117
8118 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008119 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008120 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008121 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008122 Py_DECREF(rep);
8123 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008124 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008125 if (PyLong_Check(rep)) {
8126 Py_ssize_t requiredsize = *outpos+1;
8127 if (outsize<requiredsize)
8128 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8129 Py_DECREF(rep);
8130 return enc_EXCEPTION;
8131 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008132 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008133 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008134 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008135 else {
8136 const char *repchars = PyBytes_AS_STRING(rep);
8137 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8138 Py_ssize_t requiredsize = *outpos+repsize;
8139 if (outsize<requiredsize)
8140 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8141 Py_DECREF(rep);
8142 return enc_EXCEPTION;
8143 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008144 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008145 memcpy(outstart + *outpos, repchars, repsize);
8146 *outpos += repsize;
8147 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008148 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008149 Py_DECREF(rep);
8150 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008151}
8152
8153/* handle an error in PyUnicode_EncodeCharmap
8154 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008155static int
8156charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00008157 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008158 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008159 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008160 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008161{
8162 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008163 Py_ssize_t repsize;
8164 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008165 Py_UNICODE *uni2;
8166 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008167 Py_ssize_t collstartpos = *inpos;
8168 Py_ssize_t collendpos = *inpos+1;
8169 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008170 char *encoding = "charmap";
8171 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008172 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008173
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008174 /* find all unencodable characters */
8175 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008176 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008177 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008178 int res = encoding_map_lookup(p[collendpos], mapping);
8179 if (res != -1)
8180 break;
8181 ++collendpos;
8182 continue;
8183 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008184
Benjamin Peterson29060642009-01-31 22:14:21 +00008185 rep = charmapencode_lookup(p[collendpos], mapping);
8186 if (rep==NULL)
8187 return -1;
8188 else if (rep!=Py_None) {
8189 Py_DECREF(rep);
8190 break;
8191 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008192 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008193 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008194 }
8195 /* cache callback name lookup
8196 * (if not done yet, i.e. it's the first error) */
8197 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008198 if ((errors==NULL) || (!strcmp(errors, "strict")))
8199 *known_errorHandler = 1;
8200 else if (!strcmp(errors, "replace"))
8201 *known_errorHandler = 2;
8202 else if (!strcmp(errors, "ignore"))
8203 *known_errorHandler = 3;
8204 else if (!strcmp(errors, "xmlcharrefreplace"))
8205 *known_errorHandler = 4;
8206 else
8207 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008208 }
8209 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008210 case 1: /* strict */
8211 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
8212 return -1;
8213 case 2: /* replace */
8214 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008215 x = charmapencode_output('?', mapping, res, respos);
8216 if (x==enc_EXCEPTION) {
8217 return -1;
8218 }
8219 else if (x==enc_FAILED) {
8220 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
8221 return -1;
8222 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008223 }
8224 /* fall through */
8225 case 3: /* ignore */
8226 *inpos = collendpos;
8227 break;
8228 case 4: /* xmlcharrefreplace */
8229 /* generate replacement (temporarily (mis)uses p) */
8230 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008231 char buffer[2+29+1+1];
8232 char *cp;
8233 sprintf(buffer, "&#%d;", (int)p[collpos]);
8234 for (cp = buffer; *cp; ++cp) {
8235 x = charmapencode_output(*cp, mapping, res, respos);
8236 if (x==enc_EXCEPTION)
8237 return -1;
8238 else if (x==enc_FAILED) {
8239 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
8240 return -1;
8241 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008242 }
8243 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008244 *inpos = collendpos;
8245 break;
8246 default:
8247 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00008248 encoding, reason, p, size, exceptionObject,
8249 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008250 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008251 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008252 if (PyBytes_Check(repunicode)) {
8253 /* Directly copy bytes result to output. */
8254 Py_ssize_t outsize = PyBytes_Size(*res);
8255 Py_ssize_t requiredsize;
8256 repsize = PyBytes_Size(repunicode);
8257 requiredsize = *respos + repsize;
8258 if (requiredsize > outsize)
8259 /* Make room for all additional bytes. */
8260 if (charmapencode_resize(res, respos, requiredsize)) {
8261 Py_DECREF(repunicode);
8262 return -1;
8263 }
8264 memcpy(PyBytes_AsString(*res) + *respos,
8265 PyBytes_AsString(repunicode), repsize);
8266 *respos += repsize;
8267 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008268 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008269 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008270 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008271 /* generate replacement */
8272 repsize = PyUnicode_GET_SIZE(repunicode);
8273 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008274 x = charmapencode_output(*uni2, mapping, res, respos);
8275 if (x==enc_EXCEPTION) {
8276 return -1;
8277 }
8278 else if (x==enc_FAILED) {
8279 Py_DECREF(repunicode);
8280 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
8281 return -1;
8282 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008283 }
8284 *inpos = newpos;
8285 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008286 }
8287 return 0;
8288}
8289
Alexander Belopolsky40018472011-02-26 01:02:56 +00008290PyObject *
8291PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8292 Py_ssize_t size,
8293 PyObject *mapping,
8294 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008295{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008296 /* output object */
8297 PyObject *res = NULL;
8298 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008299 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008300 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008301 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008302 PyObject *errorHandler = NULL;
8303 PyObject *exc = NULL;
8304 /* the following variable is used for caching string comparisons
8305 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8306 * 3=ignore, 4=xmlcharrefreplace */
8307 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008308
8309 /* Default to Latin-1 */
8310 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008311 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008312
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008313 /* allocate enough for a simple encoding without
8314 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008315 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008316 if (res == NULL)
8317 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008318 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008319 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008320
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008321 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008322 /* try to encode it */
8323 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
8324 if (x==enc_EXCEPTION) /* error */
8325 goto onError;
8326 if (x==enc_FAILED) { /* unencodable character */
8327 if (charmap_encoding_error(p, size, &inpos, mapping,
8328 &exc,
8329 &known_errorHandler, &errorHandler, errors,
8330 &res, &respos)) {
8331 goto onError;
8332 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008333 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008334 else
8335 /* done with this character => adjust input position */
8336 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008337 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008338
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008339 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008340 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008341 if (_PyBytes_Resize(&res, respos) < 0)
8342 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008343
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008344 Py_XDECREF(exc);
8345 Py_XDECREF(errorHandler);
8346 return res;
8347
Benjamin Peterson29060642009-01-31 22:14:21 +00008348 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008349 Py_XDECREF(res);
8350 Py_XDECREF(exc);
8351 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008352 return NULL;
8353}
8354
Alexander Belopolsky40018472011-02-26 01:02:56 +00008355PyObject *
8356PyUnicode_AsCharmapString(PyObject *unicode,
8357 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008358{
8359 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008360 PyErr_BadArgument();
8361 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008362 }
8363 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00008364 PyUnicode_GET_SIZE(unicode),
8365 mapping,
8366 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008367}
8368
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008369/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008370static void
8371make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008372 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008373 Py_ssize_t startpos, Py_ssize_t endpos,
8374 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008375{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008376 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008377 *exceptionObject = _PyUnicodeTranslateError_Create(
8378 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008379 }
8380 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008381 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8382 goto onError;
8383 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8384 goto onError;
8385 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8386 goto onError;
8387 return;
8388 onError:
8389 Py_DECREF(*exceptionObject);
8390 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008391 }
8392}
8393
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008394/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008395static void
8396raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008397 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008398 Py_ssize_t startpos, Py_ssize_t endpos,
8399 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008400{
8401 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008402 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008403 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008404 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008405}
8406
8407/* error handling callback helper:
8408 build arguments, call the callback and check the arguments,
8409 put the result into newpos and return the replacement string, which
8410 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008411static PyObject *
8412unicode_translate_call_errorhandler(const char *errors,
8413 PyObject **errorHandler,
8414 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008415 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008416 Py_ssize_t startpos, Py_ssize_t endpos,
8417 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008418{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008419 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008420
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008421 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008422 PyObject *restuple;
8423 PyObject *resunicode;
8424
8425 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008426 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008427 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008428 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008429 }
8430
8431 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008432 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008433 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008434 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008435
8436 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008437 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008438 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008439 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008440 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008441 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008442 Py_DECREF(restuple);
8443 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008444 }
8445 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008446 &resunicode, &i_newpos)) {
8447 Py_DECREF(restuple);
8448 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008449 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008450 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008451 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008452 else
8453 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008454 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008455 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8456 Py_DECREF(restuple);
8457 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008458 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008459 Py_INCREF(resunicode);
8460 Py_DECREF(restuple);
8461 return resunicode;
8462}
8463
8464/* Lookup the character ch in the mapping and put the result in result,
8465 which must be decrefed by the caller.
8466 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008467static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008468charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008469{
Christian Heimes217cfd12007-12-02 14:31:20 +00008470 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008471 PyObject *x;
8472
8473 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008474 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008475 x = PyObject_GetItem(mapping, w);
8476 Py_DECREF(w);
8477 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008478 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8479 /* No mapping found means: use 1:1 mapping. */
8480 PyErr_Clear();
8481 *result = NULL;
8482 return 0;
8483 } else
8484 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008485 }
8486 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008487 *result = x;
8488 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008489 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008490 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008491 long value = PyLong_AS_LONG(x);
8492 long max = PyUnicode_GetMax();
8493 if (value < 0 || value > max) {
8494 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008495 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008496 Py_DECREF(x);
8497 return -1;
8498 }
8499 *result = x;
8500 return 0;
8501 }
8502 else if (PyUnicode_Check(x)) {
8503 *result = x;
8504 return 0;
8505 }
8506 else {
8507 /* wrong return value */
8508 PyErr_SetString(PyExc_TypeError,
8509 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008510 Py_DECREF(x);
8511 return -1;
8512 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008513}
8514/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008515 if not reallocate and adjust various state variables.
8516 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008517static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008518charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008519 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008520{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008521 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00008522 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008523 /* exponentially overallocate to minimize reallocations */
8524 if (requiredsize < 2 * oldsize)
8525 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008526 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8527 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008528 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008529 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008530 }
8531 return 0;
8532}
8533/* lookup the character, put the result in the output string and adjust
8534 various state variables. Return a new reference to the object that
8535 was put in the output buffer in *result, or Py_None, if the mapping was
8536 undefined (in which case no character was written).
8537 The called must decref result.
8538 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008539static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008540charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8541 PyObject *mapping, Py_UCS4 **output,
8542 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008543 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008544{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008545 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8546 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008547 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008548 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008549 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008550 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008551 }
8552 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008553 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008554 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008555 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008556 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008557 }
8558 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008559 Py_ssize_t repsize;
8560 if (PyUnicode_READY(*res) == -1)
8561 return -1;
8562 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008563 if (repsize==1) {
8564 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008565 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008566 }
8567 else if (repsize!=0) {
8568 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008569 Py_ssize_t requiredsize = *opos +
8570 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008571 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008572 Py_ssize_t i;
8573 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008574 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008575 for(i = 0; i < repsize; i++)
8576 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008577 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008578 }
8579 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008580 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008581 return 0;
8582}
8583
Alexander Belopolsky40018472011-02-26 01:02:56 +00008584PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008585_PyUnicode_TranslateCharmap(PyObject *input,
8586 PyObject *mapping,
8587 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008588{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008589 /* input object */
8590 char *idata;
8591 Py_ssize_t size, i;
8592 int kind;
8593 /* output buffer */
8594 Py_UCS4 *output = NULL;
8595 Py_ssize_t osize;
8596 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008597 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008598 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008599 char *reason = "character maps to <undefined>";
8600 PyObject *errorHandler = NULL;
8601 PyObject *exc = NULL;
8602 /* the following variable is used for caching string comparisons
8603 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8604 * 3=ignore, 4=xmlcharrefreplace */
8605 int known_errorHandler = -1;
8606
Guido van Rossumd57fd912000-03-10 22:53:23 +00008607 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008608 PyErr_BadArgument();
8609 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008610 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008611
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008612 if (PyUnicode_READY(input) == -1)
8613 return NULL;
8614 idata = (char*)PyUnicode_DATA(input);
8615 kind = PyUnicode_KIND(input);
8616 size = PyUnicode_GET_LENGTH(input);
8617 i = 0;
8618
8619 if (size == 0) {
8620 Py_INCREF(input);
8621 return input;
8622 }
8623
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008624 /* allocate enough for a simple 1:1 translation without
8625 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008626 osize = size;
8627 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8628 opos = 0;
8629 if (output == NULL) {
8630 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008631 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008632 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008633
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008634 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008635 /* try to encode it */
8636 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008637 if (charmaptranslate_output(input, i, mapping,
8638 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008639 Py_XDECREF(x);
8640 goto onError;
8641 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008642 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008643 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008644 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008645 else { /* untranslatable character */
8646 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8647 Py_ssize_t repsize;
8648 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008649 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008650 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008651 Py_ssize_t collstart = i;
8652 Py_ssize_t collend = i+1;
8653 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008654
Benjamin Peterson29060642009-01-31 22:14:21 +00008655 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008656 while (collend < size) {
8657 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008658 goto onError;
8659 Py_XDECREF(x);
8660 if (x!=Py_None)
8661 break;
8662 ++collend;
8663 }
8664 /* cache callback name lookup
8665 * (if not done yet, i.e. it's the first error) */
8666 if (known_errorHandler==-1) {
8667 if ((errors==NULL) || (!strcmp(errors, "strict")))
8668 known_errorHandler = 1;
8669 else if (!strcmp(errors, "replace"))
8670 known_errorHandler = 2;
8671 else if (!strcmp(errors, "ignore"))
8672 known_errorHandler = 3;
8673 else if (!strcmp(errors, "xmlcharrefreplace"))
8674 known_errorHandler = 4;
8675 else
8676 known_errorHandler = 0;
8677 }
8678 switch (known_errorHandler) {
8679 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008680 raise_translate_exception(&exc, input, collstart,
8681 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008682 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008683 case 2: /* replace */
8684 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008685 for (coll = collstart; coll<collend; coll++)
8686 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008687 /* fall through */
8688 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008689 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008690 break;
8691 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008692 /* generate replacement (temporarily (mis)uses i) */
8693 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008694 char buffer[2+29+1+1];
8695 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008696 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8697 if (charmaptranslate_makespace(&output, &osize,
8698 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008699 goto onError;
8700 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008701 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008702 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008703 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008704 break;
8705 default:
8706 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008707 reason, input, &exc,
8708 collstart, collend, &newpos);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02008709 if (repunicode == NULL || _PyUnicode_READY_REPLACE(&repunicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00008710 goto onError;
8711 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008712 repsize = PyUnicode_GET_LENGTH(repunicode);
8713 if (charmaptranslate_makespace(&output, &osize,
8714 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008715 Py_DECREF(repunicode);
8716 goto onError;
8717 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008718 for (uni2 = 0; repsize-->0; ++uni2)
8719 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8720 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008721 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008722 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008723 }
8724 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008725 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8726 if (!res)
8727 goto onError;
8728 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008729 Py_XDECREF(exc);
8730 Py_XDECREF(errorHandler);
8731 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008732
Benjamin Peterson29060642009-01-31 22:14:21 +00008733 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008734 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008735 Py_XDECREF(exc);
8736 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008737 return NULL;
8738}
8739
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008740/* Deprecated. Use PyUnicode_Translate instead. */
8741PyObject *
8742PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8743 Py_ssize_t size,
8744 PyObject *mapping,
8745 const char *errors)
8746{
8747 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8748 if (!unicode)
8749 return NULL;
8750 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8751}
8752
Alexander Belopolsky40018472011-02-26 01:02:56 +00008753PyObject *
8754PyUnicode_Translate(PyObject *str,
8755 PyObject *mapping,
8756 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008757{
8758 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008759
Guido van Rossumd57fd912000-03-10 22:53:23 +00008760 str = PyUnicode_FromObject(str);
8761 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008762 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008763 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008764 Py_DECREF(str);
8765 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008766
Benjamin Peterson29060642009-01-31 22:14:21 +00008767 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008768 Py_XDECREF(str);
8769 return NULL;
8770}
Tim Petersced69f82003-09-16 20:30:58 +00008771
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008772static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008773fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008774{
8775 /* No need to call PyUnicode_READY(self) because this function is only
8776 called as a callback from fixup() which does it already. */
8777 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8778 const int kind = PyUnicode_KIND(self);
8779 void *data = PyUnicode_DATA(self);
8780 Py_UCS4 maxchar = 0, ch, fixed;
8781 Py_ssize_t i;
8782
8783 for (i = 0; i < len; ++i) {
8784 ch = PyUnicode_READ(kind, data, i);
8785 fixed = 0;
8786 if (ch > 127) {
8787 if (Py_UNICODE_ISSPACE(ch))
8788 fixed = ' ';
8789 else {
8790 const int decimal = Py_UNICODE_TODECIMAL(ch);
8791 if (decimal >= 0)
8792 fixed = '0' + decimal;
8793 }
8794 if (fixed != 0) {
8795 if (fixed > maxchar)
8796 maxchar = fixed;
8797 PyUnicode_WRITE(kind, data, i, fixed);
8798 }
8799 else if (ch > maxchar)
8800 maxchar = ch;
8801 }
8802 else if (ch > maxchar)
8803 maxchar = ch;
8804 }
8805
8806 return maxchar;
8807}
8808
8809PyObject *
8810_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8811{
8812 if (!PyUnicode_Check(unicode)) {
8813 PyErr_BadInternalCall();
8814 return NULL;
8815 }
8816 if (PyUnicode_READY(unicode) == -1)
8817 return NULL;
8818 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8819 /* If the string is already ASCII, just return the same string */
8820 Py_INCREF(unicode);
8821 return unicode;
8822 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008823 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008824}
8825
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008826PyObject *
8827PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8828 Py_ssize_t length)
8829{
8830 PyObject *result;
8831 Py_UNICODE *p; /* write pointer into result */
8832 Py_ssize_t i;
8833 /* Copy to a new string */
8834 result = (PyObject *)_PyUnicode_New(length);
8835 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
8836 if (result == NULL)
8837 return result;
8838 p = PyUnicode_AS_UNICODE(result);
8839 /* Iterate over code points */
8840 for (i = 0; i < length; i++) {
8841 Py_UNICODE ch =s[i];
8842 if (ch > 127) {
8843 int decimal = Py_UNICODE_TODECIMAL(ch);
8844 if (decimal >= 0)
8845 p[i] = '0' + decimal;
8846 }
8847 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008848#ifndef DONT_MAKE_RESULT_READY
8849 if (_PyUnicode_READY_REPLACE(&result)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008850 Py_DECREF(result);
8851 return NULL;
8852 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008853#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02008854 assert(_PyUnicode_CheckConsistency(result, 1));
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008855 return result;
8856}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008857/* --- Decimal Encoder ---------------------------------------------------- */
8858
Alexander Belopolsky40018472011-02-26 01:02:56 +00008859int
8860PyUnicode_EncodeDecimal(Py_UNICODE *s,
8861 Py_ssize_t length,
8862 char *output,
8863 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008864{
8865 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008866 PyObject *errorHandler = NULL;
8867 PyObject *exc = NULL;
8868 const char *encoding = "decimal";
8869 const char *reason = "invalid decimal Unicode string";
8870 /* the following variable is used for caching string comparisons
8871 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
8872 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008873
8874 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008875 PyErr_BadArgument();
8876 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008877 }
8878
8879 p = s;
8880 end = s + length;
8881 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008882 register Py_UNICODE ch = *p;
8883 int decimal;
8884 PyObject *repunicode;
8885 Py_ssize_t repsize;
8886 Py_ssize_t newpos;
8887 Py_UNICODE *uni2;
8888 Py_UNICODE *collstart;
8889 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00008890
Benjamin Peterson29060642009-01-31 22:14:21 +00008891 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008892 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00008893 ++p;
8894 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008895 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008896 decimal = Py_UNICODE_TODECIMAL(ch);
8897 if (decimal >= 0) {
8898 *output++ = '0' + decimal;
8899 ++p;
8900 continue;
8901 }
8902 if (0 < ch && ch < 256) {
8903 *output++ = (char)ch;
8904 ++p;
8905 continue;
8906 }
8907 /* All other characters are considered unencodable */
8908 collstart = p;
8909 collend = p+1;
8910 while (collend < end) {
8911 if ((0 < *collend && *collend < 256) ||
8912 !Py_UNICODE_ISSPACE(*collend) ||
8913 Py_UNICODE_TODECIMAL(*collend))
8914 break;
8915 }
8916 /* cache callback name lookup
8917 * (if not done yet, i.e. it's the first error) */
8918 if (known_errorHandler==-1) {
8919 if ((errors==NULL) || (!strcmp(errors, "strict")))
8920 known_errorHandler = 1;
8921 else if (!strcmp(errors, "replace"))
8922 known_errorHandler = 2;
8923 else if (!strcmp(errors, "ignore"))
8924 known_errorHandler = 3;
8925 else if (!strcmp(errors, "xmlcharrefreplace"))
8926 known_errorHandler = 4;
8927 else
8928 known_errorHandler = 0;
8929 }
8930 switch (known_errorHandler) {
8931 case 1: /* strict */
8932 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
8933 goto onError;
8934 case 2: /* replace */
8935 for (p = collstart; p < collend; ++p)
8936 *output++ = '?';
8937 /* fall through */
8938 case 3: /* ignore */
8939 p = collend;
8940 break;
8941 case 4: /* xmlcharrefreplace */
8942 /* generate replacement (temporarily (mis)uses p) */
8943 for (p = collstart; p < collend; ++p)
8944 output += sprintf(output, "&#%d;", (int)*p);
8945 p = collend;
8946 break;
8947 default:
8948 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
8949 encoding, reason, s, length, &exc,
8950 collstart-s, collend-s, &newpos);
8951 if (repunicode == NULL)
8952 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008953 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00008954 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008955 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8956 Py_DECREF(repunicode);
8957 goto onError;
8958 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008959 /* generate replacement */
8960 repsize = PyUnicode_GET_SIZE(repunicode);
8961 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8962 Py_UNICODE ch = *uni2;
8963 if (Py_UNICODE_ISSPACE(ch))
8964 *output++ = ' ';
8965 else {
8966 decimal = Py_UNICODE_TODECIMAL(ch);
8967 if (decimal >= 0)
8968 *output++ = '0' + decimal;
8969 else if (0 < ch && ch < 256)
8970 *output++ = (char)ch;
8971 else {
8972 Py_DECREF(repunicode);
8973 raise_encode_exception(&exc, encoding,
8974 s, length, collstart-s, collend-s, reason);
8975 goto onError;
8976 }
8977 }
8978 }
8979 p = s + newpos;
8980 Py_DECREF(repunicode);
8981 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00008982 }
8983 /* 0-terminate the output string */
8984 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008985 Py_XDECREF(exc);
8986 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008987 return 0;
8988
Benjamin Peterson29060642009-01-31 22:14:21 +00008989 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008990 Py_XDECREF(exc);
8991 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008992 return -1;
8993}
8994
Guido van Rossumd57fd912000-03-10 22:53:23 +00008995/* --- Helpers ------------------------------------------------------------ */
8996
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008997static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008998any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008999 Py_ssize_t start,
9000 Py_ssize_t end)
9001{
9002 int kind1, kind2, kind;
9003 void *buf1, *buf2;
9004 Py_ssize_t len1, len2, result;
9005
9006 kind1 = PyUnicode_KIND(s1);
9007 kind2 = PyUnicode_KIND(s2);
9008 kind = kind1 > kind2 ? kind1 : kind2;
9009 buf1 = PyUnicode_DATA(s1);
9010 buf2 = PyUnicode_DATA(s2);
9011 if (kind1 != kind)
9012 buf1 = _PyUnicode_AsKind(s1, kind);
9013 if (!buf1)
9014 return -2;
9015 if (kind2 != kind)
9016 buf2 = _PyUnicode_AsKind(s2, kind);
9017 if (!buf2) {
9018 if (kind1 != kind) PyMem_Free(buf1);
9019 return -2;
9020 }
9021 len1 = PyUnicode_GET_LENGTH(s1);
9022 len2 = PyUnicode_GET_LENGTH(s2);
9023
Victor Stinner794d5672011-10-10 03:21:36 +02009024 if (direction > 0) {
9025 switch(kind) {
9026 case PyUnicode_1BYTE_KIND:
9027 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9028 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9029 else
9030 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9031 break;
9032 case PyUnicode_2BYTE_KIND:
9033 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9034 break;
9035 case PyUnicode_4BYTE_KIND:
9036 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9037 break;
9038 default:
9039 assert(0); result = -2;
9040 }
9041 }
9042 else {
9043 switch(kind) {
9044 case PyUnicode_1BYTE_KIND:
9045 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9046 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9047 else
9048 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9049 break;
9050 case PyUnicode_2BYTE_KIND:
9051 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9052 break;
9053 case PyUnicode_4BYTE_KIND:
9054 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9055 break;
9056 default:
9057 assert(0); result = -2;
9058 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009059 }
9060
9061 if (kind1 != kind)
9062 PyMem_Free(buf1);
9063 if (kind2 != kind)
9064 PyMem_Free(buf2);
9065
9066 return result;
9067}
9068
9069Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009070_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009071 Py_ssize_t n_buffer,
9072 void *digits, Py_ssize_t n_digits,
9073 Py_ssize_t min_width,
9074 const char *grouping,
9075 const char *thousands_sep)
9076{
9077 switch(kind) {
9078 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009079 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
9080 return _PyUnicode_ascii_InsertThousandsGrouping(
9081 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9082 min_width, grouping, thousands_sep);
9083 else
9084 return _PyUnicode_ucs1_InsertThousandsGrouping(
9085 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9086 min_width, grouping, thousands_sep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009087 case PyUnicode_2BYTE_KIND:
9088 return _PyUnicode_ucs2_InsertThousandsGrouping(
9089 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
9090 min_width, grouping, thousands_sep);
9091 case PyUnicode_4BYTE_KIND:
9092 return _PyUnicode_ucs4_InsertThousandsGrouping(
9093 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
9094 min_width, grouping, thousands_sep);
9095 }
9096 assert(0);
9097 return -1;
9098}
9099
9100
/* Helper macro clamping slice bounds against a length, in place:
     - end greater than len becomes len;
     - a negative start or end is taken relative to len (len is added),
       and set to 0 if it is still negative afterwards.
   `start` and `end` must be assignable; every argument may be evaluated
   more than once.  The expansion is two statements, not one. */
#define ADJUST_INDICES(start, end, len)         \
    if (end > len) {                            \
        end = len;                              \
    }                                           \
    else if (end < 0) {                         \
        end += len;                             \
        if (end < 0) {                          \
            end = 0;                            \
        }                                       \
    }                                           \
    if (start < 0) {                            \
        start += len;                           \
        if (start < 0) {                        \
            start = 0;                          \
        }                                       \
    }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009115
Alexander Belopolsky40018472011-02-26 01:02:56 +00009116Py_ssize_t
9117PyUnicode_Count(PyObject *str,
9118 PyObject *substr,
9119 Py_ssize_t start,
9120 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009121{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009122 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009123 PyUnicodeObject* str_obj;
9124 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009125 int kind1, kind2, kind;
9126 void *buf1 = NULL, *buf2 = NULL;
9127 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009128
Thomas Wouters477c8d52006-05-27 19:21:47 +00009129 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009130 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009131 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009132 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02009133 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009134 Py_DECREF(str_obj);
9135 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009136 }
Tim Petersced69f82003-09-16 20:30:58 +00009137
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009138 kind1 = PyUnicode_KIND(str_obj);
9139 kind2 = PyUnicode_KIND(sub_obj);
9140 kind = kind1 > kind2 ? kind1 : kind2;
9141 buf1 = PyUnicode_DATA(str_obj);
9142 if (kind1 != kind)
9143 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
9144 if (!buf1)
9145 goto onError;
9146 buf2 = PyUnicode_DATA(sub_obj);
9147 if (kind2 != kind)
9148 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
9149 if (!buf2)
9150 goto onError;
9151 len1 = PyUnicode_GET_LENGTH(str_obj);
9152 len2 = PyUnicode_GET_LENGTH(sub_obj);
9153
9154 ADJUST_INDICES(start, end, len1);
9155 switch(kind) {
9156 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009157 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9158 result = asciilib_count(
9159 ((Py_UCS1*)buf1) + start, end - start,
9160 buf2, len2, PY_SSIZE_T_MAX
9161 );
9162 else
9163 result = ucs1lib_count(
9164 ((Py_UCS1*)buf1) + start, end - start,
9165 buf2, len2, PY_SSIZE_T_MAX
9166 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009167 break;
9168 case PyUnicode_2BYTE_KIND:
9169 result = ucs2lib_count(
9170 ((Py_UCS2*)buf1) + start, end - start,
9171 buf2, len2, PY_SSIZE_T_MAX
9172 );
9173 break;
9174 case PyUnicode_4BYTE_KIND:
9175 result = ucs4lib_count(
9176 ((Py_UCS4*)buf1) + start, end - start,
9177 buf2, len2, PY_SSIZE_T_MAX
9178 );
9179 break;
9180 default:
9181 assert(0); result = 0;
9182 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009183
9184 Py_DECREF(sub_obj);
9185 Py_DECREF(str_obj);
9186
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009187 if (kind1 != kind)
9188 PyMem_Free(buf1);
9189 if (kind2 != kind)
9190 PyMem_Free(buf2);
9191
Guido van Rossumd57fd912000-03-10 22:53:23 +00009192 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009193 onError:
9194 Py_DECREF(sub_obj);
9195 Py_DECREF(str_obj);
9196 if (kind1 != kind && buf1)
9197 PyMem_Free(buf1);
9198 if (kind2 != kind && buf2)
9199 PyMem_Free(buf2);
9200 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009201}
9202
Alexander Belopolsky40018472011-02-26 01:02:56 +00009203Py_ssize_t
9204PyUnicode_Find(PyObject *str,
9205 PyObject *sub,
9206 Py_ssize_t start,
9207 Py_ssize_t end,
9208 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009209{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009210 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009211
Guido van Rossumd57fd912000-03-10 22:53:23 +00009212 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009213 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009214 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009215 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009216 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009217 Py_DECREF(str);
9218 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009219 }
Tim Petersced69f82003-09-16 20:30:58 +00009220
Victor Stinner794d5672011-10-10 03:21:36 +02009221 result = any_find_slice(direction,
9222 str, sub, start, end
9223 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009224
Guido van Rossumd57fd912000-03-10 22:53:23 +00009225 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009226 Py_DECREF(sub);
9227
Guido van Rossumd57fd912000-03-10 22:53:23 +00009228 return result;
9229}
9230
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009231Py_ssize_t
9232PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9233 Py_ssize_t start, Py_ssize_t end,
9234 int direction)
9235{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009236 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009237 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009238 if (PyUnicode_READY(str) == -1)
9239 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009240 if (start < 0 || end < 0) {
9241 PyErr_SetString(PyExc_IndexError, "string index out of range");
9242 return -2;
9243 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009244 if (end > PyUnicode_GET_LENGTH(str))
9245 end = PyUnicode_GET_LENGTH(str);
9246 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009247 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9248 kind, end-start, ch, direction);
9249 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009250 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009251 else
9252 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009253}
9254
Alexander Belopolsky40018472011-02-26 01:02:56 +00009255static int
9256tailmatch(PyUnicodeObject *self,
9257 PyUnicodeObject *substring,
9258 Py_ssize_t start,
9259 Py_ssize_t end,
9260 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009261{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009262 int kind_self;
9263 int kind_sub;
9264 void *data_self;
9265 void *data_sub;
9266 Py_ssize_t offset;
9267 Py_ssize_t i;
9268 Py_ssize_t end_sub;
9269
9270 if (PyUnicode_READY(self) == -1 ||
9271 PyUnicode_READY(substring) == -1)
9272 return 0;
9273
9274 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009275 return 1;
9276
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009277 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9278 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009279 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009280 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009281
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009282 kind_self = PyUnicode_KIND(self);
9283 data_self = PyUnicode_DATA(self);
9284 kind_sub = PyUnicode_KIND(substring);
9285 data_sub = PyUnicode_DATA(substring);
9286 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9287
9288 if (direction > 0)
9289 offset = end;
9290 else
9291 offset = start;
9292
9293 if (PyUnicode_READ(kind_self, data_self, offset) ==
9294 PyUnicode_READ(kind_sub, data_sub, 0) &&
9295 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9296 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9297 /* If both are of the same kind, memcmp is sufficient */
9298 if (kind_self == kind_sub) {
9299 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009300 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009301 data_sub,
9302 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009303 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009304 }
9305 /* otherwise we have to compare each character by first accesing it */
9306 else {
9307 /* We do not need to compare 0 and len(substring)-1 because
9308 the if statement above ensured already that they are equal
9309 when we end up here. */
9310 // TODO: honor direction and do a forward or backwards search
9311 for (i = 1; i < end_sub; ++i) {
9312 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9313 PyUnicode_READ(kind_sub, data_sub, i))
9314 return 0;
9315 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009316 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009317 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009318 }
9319
9320 return 0;
9321}
9322
Alexander Belopolsky40018472011-02-26 01:02:56 +00009323Py_ssize_t
9324PyUnicode_Tailmatch(PyObject *str,
9325 PyObject *substr,
9326 Py_ssize_t start,
9327 Py_ssize_t end,
9328 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009329{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009330 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009331
Guido van Rossumd57fd912000-03-10 22:53:23 +00009332 str = PyUnicode_FromObject(str);
9333 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009334 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009335 substr = PyUnicode_FromObject(substr);
9336 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009337 Py_DECREF(str);
9338 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009339 }
Tim Petersced69f82003-09-16 20:30:58 +00009340
Guido van Rossumd57fd912000-03-10 22:53:23 +00009341 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00009342 (PyUnicodeObject *)substr,
9343 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009344 Py_DECREF(str);
9345 Py_DECREF(substr);
9346 return result;
9347}
9348
Guido van Rossumd57fd912000-03-10 22:53:23 +00009349/* Apply fixfct filter to the Unicode object self and return a
9350 reference to the modified object */
9351
Alexander Belopolsky40018472011-02-26 01:02:56 +00009352static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009353fixup(PyObject *self,
9354 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009355{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009356 PyObject *u;
9357 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009358
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009359 if (PyUnicode_READY(self) == -1)
9360 return NULL;
9361 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
9362 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
9363 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009364 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009365 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009366
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009367 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009368 PyUnicode_GET_LENGTH(u) * PyUnicode_KIND(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009369
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009370 /* fix functions return the new maximum character in a string,
9371 if the kind of the resulting unicode object does not change,
9372 everything is fine. Otherwise we need to change the string kind
9373 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009374 maxchar_new = fixfct(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009375 if (maxchar_new == 0)
9376 /* do nothing, keep maxchar_new at 0 which means no changes. */;
9377 else if (maxchar_new <= 127)
9378 maxchar_new = 127;
9379 else if (maxchar_new <= 255)
9380 maxchar_new = 255;
9381 else if (maxchar_new <= 65535)
9382 maxchar_new = 65535;
9383 else
9384 maxchar_new = 1114111; /* 0x10ffff */
9385
9386 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009387 /* fixfct should return TRUE if it modified the buffer. If
9388 FALSE, return a reference to the original buffer instead
9389 (to save space, not time) */
9390 Py_INCREF(self);
9391 Py_DECREF(u);
9392 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009393 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009394 else if (maxchar_new == maxchar_old) {
9395 return u;
9396 }
9397 else {
9398 /* In case the maximum character changed, we need to
9399 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009400 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009401 if (v == NULL) {
9402 Py_DECREF(u);
9403 return NULL;
9404 }
9405 if (maxchar_new > maxchar_old) {
9406 /* If the maxchar increased so that the kind changed, not all
9407 characters are representable anymore and we need to fix the
9408 string again. This only happens in very few cases. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009409 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner9310abb2011-10-05 00:59:23 +02009410 maxchar_old = fixfct(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009411 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
9412 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009413 else {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009414 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009415 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009416
9417 Py_DECREF(u);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009418 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009419 return v;
9420 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009421}
9422
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009423static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009424fixupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009425{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009426 /* No need to call PyUnicode_READY(self) because this function is only
9427 called as a callback from fixup() which does it already. */
9428 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9429 const int kind = PyUnicode_KIND(self);
9430 void *data = PyUnicode_DATA(self);
9431 int touched = 0;
9432 Py_UCS4 maxchar = 0;
9433 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009434
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009435 for (i = 0; i < len; ++i) {
9436 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9437 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
9438 if (up != ch) {
9439 if (up > maxchar)
9440 maxchar = up;
9441 PyUnicode_WRITE(kind, data, i, up);
9442 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009443 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009444 else if (ch > maxchar)
9445 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009446 }
9447
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009448 if (touched)
9449 return maxchar;
9450 else
9451 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009452}
9453
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009454static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009455fixlower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009456{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009457 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9458 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9459 const int kind = PyUnicode_KIND(self);
9460 void *data = PyUnicode_DATA(self);
9461 int touched = 0;
9462 Py_UCS4 maxchar = 0;
9463 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009464
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009465 for(i = 0; i < len; ++i) {
9466 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9467 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9468 if (lo != ch) {
9469 if (lo > maxchar)
9470 maxchar = lo;
9471 PyUnicode_WRITE(kind, data, i, lo);
9472 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009473 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009474 else if (ch > maxchar)
9475 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009476 }
9477
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009478 if (touched)
9479 return maxchar;
9480 else
9481 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009482}
9483
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009484static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009485fixswapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009486{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009487 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9488 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9489 const int kind = PyUnicode_KIND(self);
9490 void *data = PyUnicode_DATA(self);
9491 int touched = 0;
9492 Py_UCS4 maxchar = 0;
9493 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009494
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009495 for(i = 0; i < len; ++i) {
9496 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9497 Py_UCS4 nu = 0;
9498
9499 if (Py_UNICODE_ISUPPER(ch))
9500 nu = Py_UNICODE_TOLOWER(ch);
9501 else if (Py_UNICODE_ISLOWER(ch))
9502 nu = Py_UNICODE_TOUPPER(ch);
9503
9504 if (nu != 0) {
9505 if (nu > maxchar)
9506 maxchar = nu;
9507 PyUnicode_WRITE(kind, data, i, nu);
9508 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009509 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009510 else if (ch > maxchar)
9511 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009512 }
9513
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009514 if (touched)
9515 return maxchar;
9516 else
9517 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009518}
9519
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009520static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009521fixcapitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009522{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009523 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9524 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9525 const int kind = PyUnicode_KIND(self);
9526 void *data = PyUnicode_DATA(self);
9527 int touched = 0;
9528 Py_UCS4 maxchar = 0;
9529 Py_ssize_t i = 0;
9530 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00009531
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009532 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009533 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009534
9535 ch = PyUnicode_READ(kind, data, i);
9536 if (!Py_UNICODE_ISUPPER(ch)) {
9537 maxchar = Py_UNICODE_TOUPPER(ch);
9538 PyUnicode_WRITE(kind, data, i, maxchar);
9539 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009540 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009541 ++i;
9542 for(; i < len; ++i) {
9543 ch = PyUnicode_READ(kind, data, i);
9544 if (!Py_UNICODE_ISLOWER(ch)) {
9545 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9546 if (lo > maxchar)
9547 maxchar = lo;
9548 PyUnicode_WRITE(kind, data, i, lo);
9549 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009550 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009551 else if (ch > maxchar)
9552 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009553 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009554
9555 if (touched)
9556 return maxchar;
9557 else
9558 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009559}
9560
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009561static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009562fixtitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009563{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009564 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9565 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9566 const int kind = PyUnicode_KIND(self);
9567 void *data = PyUnicode_DATA(self);
9568 Py_UCS4 maxchar = 0;
9569 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009570 int previous_is_cased;
9571
9572 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009573 if (len == 1) {
9574 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9575 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
9576 if (ti != ch) {
9577 PyUnicode_WRITE(kind, data, i, ti);
9578 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00009579 }
9580 else
9581 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009582 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009583 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009584 for(; i < len; ++i) {
9585 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9586 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00009587
Benjamin Peterson29060642009-01-31 22:14:21 +00009588 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009589 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00009590 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009591 nu = Py_UNICODE_TOTITLE(ch);
9592
9593 if (nu > maxchar)
9594 maxchar = nu;
9595 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00009596
Benjamin Peterson29060642009-01-31 22:14:21 +00009597 if (Py_UNICODE_ISLOWER(ch) ||
9598 Py_UNICODE_ISUPPER(ch) ||
9599 Py_UNICODE_ISTITLE(ch))
9600 previous_is_cased = 1;
9601 else
9602 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009603 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009604 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009605}
9606
Tim Peters8ce9f162004-08-27 01:49:32 +00009607PyObject *
9608PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009609{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009610 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009611 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009612 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009613 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009614 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9615 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009616 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009617 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009618 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009619 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009620 int use_memcpy;
9621 unsigned char *res_data = NULL, *sep_data = NULL;
9622 PyObject *last_obj;
9623 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009624
Tim Peters05eba1f2004-08-27 21:32:02 +00009625 fseq = PySequence_Fast(seq, "");
9626 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009627 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009628 }
9629
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009630 /* NOTE: the following code can't call back into Python code,
9631 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009632 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009633
Tim Peters05eba1f2004-08-27 21:32:02 +00009634 seqlen = PySequence_Fast_GET_SIZE(fseq);
9635 /* If empty sequence, return u"". */
9636 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009637 Py_DECREF(fseq);
9638 Py_INCREF(unicode_empty);
9639 res = unicode_empty;
9640 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009641 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009642
Tim Peters05eba1f2004-08-27 21:32:02 +00009643 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009644 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009645 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009646 if (seqlen == 1) {
9647 if (PyUnicode_CheckExact(items[0])) {
9648 res = items[0];
9649 Py_INCREF(res);
9650 Py_DECREF(fseq);
9651 return res;
9652 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009653 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009654 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009655 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009656 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009657 /* Set up sep and seplen */
9658 if (separator == NULL) {
9659 /* fall back to a blank space separator */
9660 sep = PyUnicode_FromOrdinal(' ');
9661 if (!sep)
9662 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009663 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009664 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009665 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009666 else {
9667 if (!PyUnicode_Check(separator)) {
9668 PyErr_Format(PyExc_TypeError,
9669 "separator: expected str instance,"
9670 " %.80s found",
9671 Py_TYPE(separator)->tp_name);
9672 goto onError;
9673 }
9674 if (PyUnicode_READY(separator))
9675 goto onError;
9676 sep = separator;
9677 seplen = PyUnicode_GET_LENGTH(separator);
9678 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9679 /* inc refcount to keep this code path symmetric with the
9680 above case of a blank separator */
9681 Py_INCREF(sep);
9682 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009683 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009684 }
9685
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009686 /* There are at least two things to join, or else we have a subclass
9687 * of str in the sequence.
9688 * Do a pre-pass to figure out the total amount of space we'll
9689 * need (sz), and see whether all argument are strings.
9690 */
9691 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009692#ifdef Py_DEBUG
9693 use_memcpy = 0;
9694#else
9695 use_memcpy = 1;
9696#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009697 for (i = 0; i < seqlen; i++) {
9698 const Py_ssize_t old_sz = sz;
9699 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009700 if (!PyUnicode_Check(item)) {
9701 PyErr_Format(PyExc_TypeError,
9702 "sequence item %zd: expected str instance,"
9703 " %.80s found",
9704 i, Py_TYPE(item)->tp_name);
9705 goto onError;
9706 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009707 if (PyUnicode_READY(item) == -1)
9708 goto onError;
9709 sz += PyUnicode_GET_LENGTH(item);
9710 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009711 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009712 if (i != 0)
9713 sz += seplen;
9714 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9715 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009716 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009717 goto onError;
9718 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009719 if (use_memcpy && last_obj != NULL) {
9720 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9721 use_memcpy = 0;
9722 }
9723 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009724 }
Tim Petersced69f82003-09-16 20:30:58 +00009725
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009726 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009727 if (res == NULL)
9728 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009729
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009730 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009731#ifdef Py_DEBUG
9732 use_memcpy = 0;
9733#else
9734 if (use_memcpy) {
9735 res_data = PyUnicode_1BYTE_DATA(res);
9736 kind = PyUnicode_KIND(res);
9737 if (seplen != 0)
9738 sep_data = PyUnicode_1BYTE_DATA(sep);
9739 }
9740#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009741 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009742 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009743 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009744 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009745 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009746 if (use_memcpy) {
9747 Py_MEMCPY(res_data,
9748 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009749 kind * seplen);
9750 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009751 }
9752 else {
9753 copy_characters(res, res_offset, sep, 0, seplen);
9754 res_offset += seplen;
9755 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009756 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009757 itemlen = PyUnicode_GET_LENGTH(item);
9758 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009759 if (use_memcpy) {
9760 Py_MEMCPY(res_data,
9761 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009762 kind * itemlen);
9763 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009764 }
9765 else {
9766 copy_characters(res, res_offset, item, 0, itemlen);
9767 res_offset += itemlen;
9768 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009769 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009770 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009771 if (use_memcpy)
9772 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009773 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009774 else
9775 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009776
Tim Peters05eba1f2004-08-27 21:32:02 +00009777 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009778 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009779 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009780 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009781
Benjamin Peterson29060642009-01-31 22:14:21 +00009782 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009783 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009784 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009785 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009786 return NULL;
9787}
9788
/* Store `value` into `length` consecutive characters of the buffer `data`,
   starting at character index `start`.  `kind` selects the character
   width: PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND, or anything else for
   Py_UCS4 storage (PyUnicode_WCHAR_KIND is rejected by an assert).

   In the 1-byte case `value` is truncated to unsigned char.  Arguments may
   be evaluated more than once, so they must be free of side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        } \
    } while (0)
9811
Victor Stinner9310abb2011-10-05 00:59:23 +02009812static PyObject *
9813pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009814 Py_ssize_t left,
9815 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009816 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009817{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009818 PyObject *u;
9819 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009820 int kind;
9821 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009822
9823 if (left < 0)
9824 left = 0;
9825 if (right < 0)
9826 right = 0;
9827
Tim Peters7a29bd52001-09-12 03:03:31 +00009828 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009829 Py_INCREF(self);
9830 return self;
9831 }
9832
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009833 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9834 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009835 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9836 return NULL;
9837 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009838 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9839 if (fill > maxchar)
9840 maxchar = fill;
9841 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009842 if (!u)
9843 return NULL;
9844
9845 kind = PyUnicode_KIND(u);
9846 data = PyUnicode_DATA(u);
9847 if (left)
9848 FILL(kind, data, fill, 0, left);
9849 if (right)
9850 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009851 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009852 assert(_PyUnicode_CheckConsistency(u, 1));
9853 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009854}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009855#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009856
Alexander Belopolsky40018472011-02-26 01:02:56 +00009857PyObject *
9858PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009859{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009860 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009861
9862 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009863 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009864 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009865
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009866 switch(PyUnicode_KIND(string)) {
9867 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009868 if (PyUnicode_IS_ASCII(string))
9869 list = asciilib_splitlines(
9870 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9871 PyUnicode_GET_LENGTH(string), keepends);
9872 else
9873 list = ucs1lib_splitlines(
9874 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9875 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009876 break;
9877 case PyUnicode_2BYTE_KIND:
9878 list = ucs2lib_splitlines(
9879 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
9880 PyUnicode_GET_LENGTH(string), keepends);
9881 break;
9882 case PyUnicode_4BYTE_KIND:
9883 list = ucs4lib_splitlines(
9884 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
9885 PyUnicode_GET_LENGTH(string), keepends);
9886 break;
9887 default:
9888 assert(0);
9889 list = 0;
9890 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009891 Py_DECREF(string);
9892 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009893}
9894
Alexander Belopolsky40018472011-02-26 01:02:56 +00009895static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009896split(PyObject *self,
9897 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009898 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009899{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009900 int kind1, kind2, kind;
9901 void *buf1, *buf2;
9902 Py_ssize_t len1, len2;
9903 PyObject* out;
9904
Guido van Rossumd57fd912000-03-10 22:53:23 +00009905 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009906 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009907
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009908 if (PyUnicode_READY(self) == -1)
9909 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009910
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009911 if (substring == NULL)
9912 switch(PyUnicode_KIND(self)) {
9913 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009914 if (PyUnicode_IS_ASCII(self))
9915 return asciilib_split_whitespace(
9916 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9917 PyUnicode_GET_LENGTH(self), maxcount
9918 );
9919 else
9920 return ucs1lib_split_whitespace(
9921 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9922 PyUnicode_GET_LENGTH(self), maxcount
9923 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009924 case PyUnicode_2BYTE_KIND:
9925 return ucs2lib_split_whitespace(
9926 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9927 PyUnicode_GET_LENGTH(self), maxcount
9928 );
9929 case PyUnicode_4BYTE_KIND:
9930 return ucs4lib_split_whitespace(
9931 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9932 PyUnicode_GET_LENGTH(self), maxcount
9933 );
9934 default:
9935 assert(0);
9936 return NULL;
9937 }
9938
9939 if (PyUnicode_READY(substring) == -1)
9940 return NULL;
9941
9942 kind1 = PyUnicode_KIND(self);
9943 kind2 = PyUnicode_KIND(substring);
9944 kind = kind1 > kind2 ? kind1 : kind2;
9945 buf1 = PyUnicode_DATA(self);
9946 buf2 = PyUnicode_DATA(substring);
9947 if (kind1 != kind)
9948 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9949 if (!buf1)
9950 return NULL;
9951 if (kind2 != kind)
9952 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9953 if (!buf2) {
9954 if (kind1 != kind) PyMem_Free(buf1);
9955 return NULL;
9956 }
9957 len1 = PyUnicode_GET_LENGTH(self);
9958 len2 = PyUnicode_GET_LENGTH(substring);
9959
9960 switch(kind) {
9961 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009962 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9963 out = asciilib_split(
9964 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9965 else
9966 out = ucs1lib_split(
9967 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009968 break;
9969 case PyUnicode_2BYTE_KIND:
9970 out = ucs2lib_split(
9971 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9972 break;
9973 case PyUnicode_4BYTE_KIND:
9974 out = ucs4lib_split(
9975 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9976 break;
9977 default:
9978 out = NULL;
9979 }
9980 if (kind1 != kind)
9981 PyMem_Free(buf1);
9982 if (kind2 != kind)
9983 PyMem_Free(buf2);
9984 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009985}
9986
Alexander Belopolsky40018472011-02-26 01:02:56 +00009987static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009988rsplit(PyObject *self,
9989 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009990 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009991{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009992 int kind1, kind2, kind;
9993 void *buf1, *buf2;
9994 Py_ssize_t len1, len2;
9995 PyObject* out;
9996
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009997 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009998 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009999
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010000 if (PyUnicode_READY(self) == -1)
10001 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010002
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010003 if (substring == NULL)
10004 switch(PyUnicode_KIND(self)) {
10005 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010006 if (PyUnicode_IS_ASCII(self))
10007 return asciilib_rsplit_whitespace(
10008 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
10009 PyUnicode_GET_LENGTH(self), maxcount
10010 );
10011 else
10012 return ucs1lib_rsplit_whitespace(
10013 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
10014 PyUnicode_GET_LENGTH(self), maxcount
10015 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010016 case PyUnicode_2BYTE_KIND:
10017 return ucs2lib_rsplit_whitespace(
10018 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
10019 PyUnicode_GET_LENGTH(self), maxcount
10020 );
10021 case PyUnicode_4BYTE_KIND:
10022 return ucs4lib_rsplit_whitespace(
10023 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
10024 PyUnicode_GET_LENGTH(self), maxcount
10025 );
10026 default:
10027 assert(0);
10028 return NULL;
10029 }
10030
10031 if (PyUnicode_READY(substring) == -1)
10032 return NULL;
10033
10034 kind1 = PyUnicode_KIND(self);
10035 kind2 = PyUnicode_KIND(substring);
10036 kind = kind1 > kind2 ? kind1 : kind2;
10037 buf1 = PyUnicode_DATA(self);
10038 buf2 = PyUnicode_DATA(substring);
10039 if (kind1 != kind)
10040 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
10041 if (!buf1)
10042 return NULL;
10043 if (kind2 != kind)
10044 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
10045 if (!buf2) {
10046 if (kind1 != kind) PyMem_Free(buf1);
10047 return NULL;
10048 }
10049 len1 = PyUnicode_GET_LENGTH(self);
10050 len2 = PyUnicode_GET_LENGTH(substring);
10051
10052 switch(kind) {
10053 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010054 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10055 out = asciilib_rsplit(
10056 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
10057 else
10058 out = ucs1lib_rsplit(
10059 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010060 break;
10061 case PyUnicode_2BYTE_KIND:
10062 out = ucs2lib_rsplit(
10063 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
10064 break;
10065 case PyUnicode_4BYTE_KIND:
10066 out = ucs4lib_rsplit(
10067 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
10068 break;
10069 default:
10070 out = NULL;
10071 }
10072 if (kind1 != kind)
10073 PyMem_Free(buf1);
10074 if (kind2 != kind)
10075 PyMem_Free(buf2);
10076 return out;
10077}
10078
10079static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010080anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10081 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010082{
10083 switch(kind) {
10084 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010085 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10086 return asciilib_find(buf1, len1, buf2, len2, offset);
10087 else
10088 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010089 case PyUnicode_2BYTE_KIND:
10090 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10091 case PyUnicode_4BYTE_KIND:
10092 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10093 }
10094 assert(0);
10095 return -1;
10096}
10097
10098static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010099anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10100 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010101{
10102 switch(kind) {
10103 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010104 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10105 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10106 else
10107 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010108 case PyUnicode_2BYTE_KIND:
10109 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10110 case PyUnicode_4BYTE_KIND:
10111 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10112 }
10113 assert(0);
10114 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010115}
10116
Alexander Belopolsky40018472011-02-26 01:02:56 +000010117static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010118replace(PyObject *self, PyObject *str1,
10119 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010120{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010121 PyObject *u;
10122 char *sbuf = PyUnicode_DATA(self);
10123 char *buf1 = PyUnicode_DATA(str1);
10124 char *buf2 = PyUnicode_DATA(str2);
10125 int srelease = 0, release1 = 0, release2 = 0;
10126 int skind = PyUnicode_KIND(self);
10127 int kind1 = PyUnicode_KIND(str1);
10128 int kind2 = PyUnicode_KIND(str2);
10129 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10130 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10131 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010132 int mayshrink;
10133 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010134
10135 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010136 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010137 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010138 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010139
Victor Stinner59de0ee2011-10-07 10:01:28 +020010140 if (str1 == str2)
10141 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010142 if (skind < kind1)
10143 /* substring too wide to be present */
10144 goto nothing;
10145
Victor Stinner49a0a212011-10-12 23:46:10 +020010146 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10147 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10148 /* Replacing str1 with str2 may cause a maxchar reduction in the
10149 result string. */
10150 mayshrink = (maxchar_str2 < maxchar);
10151 maxchar = Py_MAX(maxchar, maxchar_str2);
10152
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010153 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +000010154 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010155 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010156 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010157 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010158 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010159 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010160 Py_UCS4 u1, u2;
10161 int rkind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010162 u1 = PyUnicode_READ_CHAR(str1, 0);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +020010163 if (findchar(sbuf, PyUnicode_KIND(self),
10164 slen, u1, 1) < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010165 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010166 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010167 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010168 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010169 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010170 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010171 rkind = PyUnicode_KIND(u);
10172 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
10173 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010174 if (--maxcount < 0)
10175 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010176 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010177 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010178 }
10179 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010180 int rkind = skind;
10181 char *res;
Victor Stinner25a4b292011-10-06 12:31:55 +020010182
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010183 if (kind1 < rkind) {
10184 /* widen substring */
10185 buf1 = _PyUnicode_AsKind(str1, rkind);
10186 if (!buf1) goto error;
10187 release1 = 1;
10188 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010189 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010190 if (i < 0)
10191 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010192 if (rkind > kind2) {
10193 /* widen replacement */
10194 buf2 = _PyUnicode_AsKind(str2, rkind);
10195 if (!buf2) goto error;
10196 release2 = 1;
10197 }
10198 else if (rkind < kind2) {
10199 /* widen self and buf1 */
10200 rkind = kind2;
10201 if (release1) PyMem_Free(buf1);
10202 sbuf = _PyUnicode_AsKind(self, rkind);
10203 if (!sbuf) goto error;
10204 srelease = 1;
10205 buf1 = _PyUnicode_AsKind(str1, rkind);
10206 if (!buf1) goto error;
10207 release1 = 1;
10208 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010209 u = PyUnicode_New(slen, maxchar);
10210 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010211 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010212 assert(PyUnicode_KIND(u) == rkind);
10213 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010214
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010215 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010216 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010217 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010218 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010219 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010220 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010221
10222 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010223 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010224 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010225 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010226 if (i == -1)
10227 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010228 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010229 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010230 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010231 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010232 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010233 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010234 }
10235 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010236 Py_ssize_t n, i, j, ires;
10237 Py_ssize_t product, new_size;
10238 int rkind = skind;
10239 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010240
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010241 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010242 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010243 buf1 = _PyUnicode_AsKind(str1, rkind);
10244 if (!buf1) goto error;
10245 release1 = 1;
10246 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010247 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010248 if (n == 0)
10249 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010250 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010251 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010252 buf2 = _PyUnicode_AsKind(str2, rkind);
10253 if (!buf2) goto error;
10254 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010255 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010256 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010257 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010258 rkind = kind2;
10259 sbuf = _PyUnicode_AsKind(self, rkind);
10260 if (!sbuf) goto error;
10261 srelease = 1;
10262 if (release1) PyMem_Free(buf1);
10263 buf1 = _PyUnicode_AsKind(str1, rkind);
10264 if (!buf1) goto error;
10265 release1 = 1;
10266 }
10267 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10268 PyUnicode_GET_LENGTH(str1))); */
10269 product = n * (len2-len1);
10270 if ((product / (len2-len1)) != n) {
10271 PyErr_SetString(PyExc_OverflowError,
10272 "replace string is too long");
10273 goto error;
10274 }
10275 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +020010276 if (new_size == 0) {
10277 Py_INCREF(unicode_empty);
10278 u = unicode_empty;
10279 goto done;
10280 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010281 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10282 PyErr_SetString(PyExc_OverflowError,
10283 "replace string is too long");
10284 goto error;
10285 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010286 u = PyUnicode_New(new_size, maxchar);
10287 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010288 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010289 assert(PyUnicode_KIND(u) == rkind);
10290 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010291 ires = i = 0;
10292 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010293 while (n-- > 0) {
10294 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010295 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010296 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010297 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010298 if (j == -1)
10299 break;
10300 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010301 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010302 memcpy(res + rkind * ires,
10303 sbuf + rkind * i,
10304 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010305 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010306 }
10307 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010308 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010309 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010310 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010311 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010312 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010313 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010314 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010315 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010316 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010317 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010318 memcpy(res + rkind * ires,
10319 sbuf + rkind * i,
10320 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010321 }
10322 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010323 /* interleave */
10324 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010325 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010326 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010327 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010328 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010329 if (--n <= 0)
10330 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010331 memcpy(res + rkind * ires,
10332 sbuf + rkind * i,
10333 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010334 ires++;
10335 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010336 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010337 memcpy(res + rkind * ires,
10338 sbuf + rkind * i,
10339 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010340 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010341 }
10342
10343 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010344 unicode_adjust_maxchar(&u);
10345 if (u == NULL)
10346 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010347 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010348
10349 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010350 if (srelease)
10351 PyMem_FREE(sbuf);
10352 if (release1)
10353 PyMem_FREE(buf1);
10354 if (release2)
10355 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010356 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010357 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010358
Benjamin Peterson29060642009-01-31 22:14:21 +000010359 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010360 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010361 if (srelease)
10362 PyMem_FREE(sbuf);
10363 if (release1)
10364 PyMem_FREE(buf1);
10365 if (release2)
10366 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010367 if (PyUnicode_CheckExact(self)) {
10368 Py_INCREF(self);
10369 return (PyObject *) self;
10370 }
Victor Stinner034f6cf2011-09-30 02:26:44 +020010371 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010372 error:
10373 if (srelease && sbuf)
10374 PyMem_FREE(sbuf);
10375 if (release1 && buf1)
10376 PyMem_FREE(buf1);
10377 if (release2 && buf2)
10378 PyMem_FREE(buf2);
10379 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010380}
10381
10382/* --- Unicode Object Methods --------------------------------------------- */
10383
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010384PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010385 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010386\n\
10387Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010388characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010389
10390static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010391unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010392{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010393 return fixup(self, fixtitle);
10394}
10395
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010396PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010397 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010398\n\
10399Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010400have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010401
10402static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010403unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010404{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010405 return fixup(self, fixcapitalize);
10406}
10407
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out by the surrounding #if 0) implementation of
   S.capwords(): split self into words, replace every word by its
   fixcapitalize'd version, then join the words again.
   Returns the joined string, or NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t pos;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, pos),
                            fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* drop the old word before the slot is overwritten */
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
10445
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010446/* Argument converter. Coerces to a single unicode character */
10447
10448static int
10449convert_uc(PyObject *obj, void *addr)
10450{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010451 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010452 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010453
Benjamin Peterson14339b62009-01-31 16:36:08 +000010454 uniobj = PyUnicode_FromObject(obj);
10455 if (uniobj == NULL) {
10456 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010457 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010458 return 0;
10459 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010460 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010461 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010462 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010463 Py_DECREF(uniobj);
10464 return 0;
10465 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010466 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010467 Py_DECREF(uniobj);
10468 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010469}
10470
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010471PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010472 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010473\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010474Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010475done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010476
10477static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010478unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010479{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010480 Py_ssize_t marg, left;
10481 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010482 Py_UCS4 fillchar = ' ';
10483
Victor Stinnere9a29352011-10-01 02:14:59 +020010484 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010485 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010486
Victor Stinnere9a29352011-10-01 02:14:59 +020010487 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010488 return NULL;
10489
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010490 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010491 Py_INCREF(self);
10492 return (PyObject*) self;
10493 }
10494
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010495 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010496 left = marg / 2 + (marg & width & 1);
10497
Victor Stinner9310abb2011-10-05 00:59:23 +020010498 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010499}
10500
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010501/* This function assumes that str1 and str2 are readied by the caller. */
10502
Marc-André Lemburge5034372000-08-08 08:04:29 +000010503static int
10504unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
10505{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010506 int kind1, kind2;
10507 void *data1, *data2;
10508 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010509
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010510 kind1 = PyUnicode_KIND(str1);
10511 kind2 = PyUnicode_KIND(str2);
10512 data1 = PyUnicode_DATA(str1);
10513 data2 = PyUnicode_DATA(str2);
10514 len1 = PyUnicode_GET_LENGTH(str1);
10515 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010516
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010517 for (i = 0; i < len1 && i < len2; ++i) {
10518 Py_UCS4 c1, c2;
10519 c1 = PyUnicode_READ(kind1, data1, i);
10520 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010521
10522 if (c1 != c2)
10523 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010524 }
10525
10526 return (len1 < len2) ? -1 : (len1 != len2);
10527}
10528
Alexander Belopolsky40018472011-02-26 01:02:56 +000010529int
10530PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010531{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010532 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10533 if (PyUnicode_READY(left) == -1 ||
10534 PyUnicode_READY(right) == -1)
10535 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010536 return unicode_compare((PyUnicodeObject *)left,
10537 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010538 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010539 PyErr_Format(PyExc_TypeError,
10540 "Can't compare %.100s and %.100s",
10541 left->ob_type->tp_name,
10542 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010543 return -1;
10544}
10545
Martin v. Löwis5b222132007-06-10 09:51:05 +000010546int
10547PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10548{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010549 Py_ssize_t i;
10550 int kind;
10551 void *data;
10552 Py_UCS4 chr;
10553
Victor Stinner910337b2011-10-03 03:20:16 +020010554 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010555 if (PyUnicode_READY(uni) == -1)
10556 return -1;
10557 kind = PyUnicode_KIND(uni);
10558 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010559 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010560 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10561 if (chr != str[i])
10562 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010563 /* This check keeps Python strings that end in '\0' from comparing equal
10564 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010565 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010566 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010567 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010568 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010569 return 0;
10570}
10571
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010572
Benjamin Peterson29060642009-01-31 22:14:21 +000010573#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010574 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010575
Alexander Belopolsky40018472011-02-26 01:02:56 +000010576PyObject *
10577PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010578{
10579 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010580
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010581 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10582 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010583 if (PyUnicode_READY(left) == -1 ||
10584 PyUnicode_READY(right) == -1)
10585 return NULL;
10586 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10587 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010588 if (op == Py_EQ) {
10589 Py_INCREF(Py_False);
10590 return Py_False;
10591 }
10592 if (op == Py_NE) {
10593 Py_INCREF(Py_True);
10594 return Py_True;
10595 }
10596 }
10597 if (left == right)
10598 result = 0;
10599 else
10600 result = unicode_compare((PyUnicodeObject *)left,
10601 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010602
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010603 /* Convert the return value to a Boolean */
10604 switch (op) {
10605 case Py_EQ:
10606 v = TEST_COND(result == 0);
10607 break;
10608 case Py_NE:
10609 v = TEST_COND(result != 0);
10610 break;
10611 case Py_LE:
10612 v = TEST_COND(result <= 0);
10613 break;
10614 case Py_GE:
10615 v = TEST_COND(result >= 0);
10616 break;
10617 case Py_LT:
10618 v = TEST_COND(result == -1);
10619 break;
10620 case Py_GT:
10621 v = TEST_COND(result == 1);
10622 break;
10623 default:
10624 PyErr_BadArgument();
10625 return NULL;
10626 }
10627 Py_INCREF(v);
10628 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010629 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010630
Brian Curtindfc80e32011-08-10 20:28:54 -050010631 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010632}
10633
Alexander Belopolsky40018472011-02-26 01:02:56 +000010634int
10635PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010636{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010637 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010638 int kind1, kind2, kind;
10639 void *buf1, *buf2;
10640 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010641 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010642
10643 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010644 sub = PyUnicode_FromObject(element);
10645 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010646 PyErr_Format(PyExc_TypeError,
10647 "'in <string>' requires string as left operand, not %s",
10648 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010649 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010650 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010651 if (PyUnicode_READY(sub) == -1)
10652 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010653
Thomas Wouters477c8d52006-05-27 19:21:47 +000010654 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +020010655 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010656 Py_DECREF(sub);
10657 return -1;
10658 }
10659
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010660 kind1 = PyUnicode_KIND(str);
10661 kind2 = PyUnicode_KIND(sub);
10662 kind = kind1 > kind2 ? kind1 : kind2;
10663 buf1 = PyUnicode_DATA(str);
10664 buf2 = PyUnicode_DATA(sub);
10665 if (kind1 != kind)
10666 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
10667 if (!buf1) {
10668 Py_DECREF(sub);
10669 return -1;
10670 }
10671 if (kind2 != kind)
10672 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
10673 if (!buf2) {
10674 Py_DECREF(sub);
10675 if (kind1 != kind) PyMem_Free(buf1);
10676 return -1;
10677 }
10678 len1 = PyUnicode_GET_LENGTH(str);
10679 len2 = PyUnicode_GET_LENGTH(sub);
10680
10681 switch(kind) {
10682 case PyUnicode_1BYTE_KIND:
10683 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10684 break;
10685 case PyUnicode_2BYTE_KIND:
10686 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10687 break;
10688 case PyUnicode_4BYTE_KIND:
10689 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10690 break;
10691 default:
10692 result = -1;
10693 assert(0);
10694 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010695
10696 Py_DECREF(str);
10697 Py_DECREF(sub);
10698
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010699 if (kind1 != kind)
10700 PyMem_Free(buf1);
10701 if (kind2 != kind)
10702 PyMem_Free(buf2);
10703
Guido van Rossum403d68b2000-03-13 15:55:09 +000010704 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010705}
10706
Guido van Rossumd57fd912000-03-10 22:53:23 +000010707/* Concat to string or Unicode object giving a new Unicode object. */
10708
Alexander Belopolsky40018472011-02-26 01:02:56 +000010709PyObject *
10710PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010711{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010712 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010713 Py_UCS4 maxchar, maxchar2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010714
10715 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010716 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010717 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010718 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010719 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010720 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010721 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010722
10723 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010724 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010725 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010726 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010727 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010728 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010729 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010730 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010731 }
10732
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010733 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010734 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
10735 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010736
Guido van Rossumd57fd912000-03-10 22:53:23 +000010737 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010738 w = PyUnicode_New(
10739 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
10740 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010741 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010742 goto onError;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010743 copy_characters(w, 0, u, 0, PyUnicode_GET_LENGTH(u));
10744 copy_characters(w, PyUnicode_GET_LENGTH(u), v, 0, PyUnicode_GET_LENGTH(v));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010745 Py_DECREF(u);
10746 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010747 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010748 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010749
Benjamin Peterson29060642009-01-31 22:14:21 +000010750 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010751 Py_XDECREF(u);
10752 Py_XDECREF(v);
10753 return NULL;
10754}
10755
Victor Stinnerb0923652011-10-04 01:17:31 +020010756static void
10757unicode_append_inplace(PyObject **p_left, PyObject *right)
10758{
10759 Py_ssize_t left_len, right_len, new_len;
Victor Stinnerb0923652011-10-04 01:17:31 +020010760
10761 assert(PyUnicode_IS_READY(*p_left));
10762 assert(PyUnicode_IS_READY(right));
10763
10764 left_len = PyUnicode_GET_LENGTH(*p_left);
10765 right_len = PyUnicode_GET_LENGTH(right);
10766 if (left_len > PY_SSIZE_T_MAX - right_len) {
10767 PyErr_SetString(PyExc_OverflowError,
10768 "strings are too large to concat");
10769 goto error;
10770 }
10771 new_len = left_len + right_len;
10772
10773 /* Now we own the last reference to 'left', so we can resize it
10774 * in-place.
10775 */
10776 if (unicode_resize(p_left, new_len) != 0) {
10777 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10778 * deallocated so it cannot be put back into
10779 * 'variable'. The MemoryError is raised when there
10780 * is no value in 'variable', which might (very
10781 * remotely) be a cause of incompatibilities.
10782 */
10783 goto error;
10784 }
10785 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010786 copy_characters(*p_left, left_len, right, 0, right_len);
10787 _PyUnicode_DIRTY(*p_left);
Victor Stinnerb0923652011-10-04 01:17:31 +020010788 return;
10789
10790error:
10791 Py_DECREF(*p_left);
10792 *p_left = NULL;
10793}
10794
Walter Dörwald1ab83302007-05-18 17:15:44 +000010795void
Victor Stinner23e56682011-10-03 03:54:37 +020010796PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010797{
Victor Stinner23e56682011-10-03 03:54:37 +020010798 PyObject *left, *res;
10799
10800 if (p_left == NULL) {
10801 if (!PyErr_Occurred())
10802 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010803 return;
10804 }
Victor Stinner23e56682011-10-03 03:54:37 +020010805 left = *p_left;
10806 if (right == NULL || !PyUnicode_Check(left)) {
10807 if (!PyErr_Occurred())
10808 PyErr_BadInternalCall();
10809 goto error;
10810 }
10811
Victor Stinnere1335c72011-10-04 20:53:03 +020010812 if (PyUnicode_READY(left))
10813 goto error;
10814 if (PyUnicode_READY(right))
10815 goto error;
10816
Victor Stinner23e56682011-10-03 03:54:37 +020010817 if (PyUnicode_CheckExact(left) && left != unicode_empty
10818 && PyUnicode_CheckExact(right) && right != unicode_empty
10819 && unicode_resizable(left)
10820 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
10821 || _PyUnicode_WSTR(left) != NULL))
10822 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010823 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10824 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010825 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010826 not so different than duplicating the string. */
10827 if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
Victor Stinner23e56682011-10-03 03:54:37 +020010828 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010829 unicode_append_inplace(p_left, right);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010830 if (p_left != NULL)
10831 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010832 return;
10833 }
10834 }
10835
10836 res = PyUnicode_Concat(left, right);
10837 if (res == NULL)
10838 goto error;
10839 Py_DECREF(left);
10840 *p_left = res;
10841 return;
10842
10843error:
10844 Py_DECREF(*p_left);
10845 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +000010846}
10847
10848void
10849PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10850{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010851 PyUnicode_Append(pleft, right);
10852 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010853}
10854
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010855PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010856 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010857\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010858Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010859string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010860interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010861
10862static PyObject *
10863unicode_count(PyUnicodeObject *self, PyObject *args)
10864{
10865 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010866 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010867 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010868 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010869 int kind1, kind2, kind;
10870 void *buf1, *buf2;
10871 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010872
Jesus Ceaac451502011-04-20 17:09:23 +020010873 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10874 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010875 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010876
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010877 kind1 = PyUnicode_KIND(self);
10878 kind2 = PyUnicode_KIND(substring);
10879 kind = kind1 > kind2 ? kind1 : kind2;
10880 buf1 = PyUnicode_DATA(self);
10881 buf2 = PyUnicode_DATA(substring);
10882 if (kind1 != kind)
10883 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
10884 if (!buf1) {
10885 Py_DECREF(substring);
10886 return NULL;
10887 }
10888 if (kind2 != kind)
10889 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
10890 if (!buf2) {
10891 Py_DECREF(substring);
10892 if (kind1 != kind) PyMem_Free(buf1);
10893 return NULL;
10894 }
10895 len1 = PyUnicode_GET_LENGTH(self);
10896 len2 = PyUnicode_GET_LENGTH(substring);
10897
10898 ADJUST_INDICES(start, end, len1);
10899 switch(kind) {
10900 case PyUnicode_1BYTE_KIND:
10901 iresult = ucs1lib_count(
10902 ((Py_UCS1*)buf1) + start, end - start,
10903 buf2, len2, PY_SSIZE_T_MAX
10904 );
10905 break;
10906 case PyUnicode_2BYTE_KIND:
10907 iresult = ucs2lib_count(
10908 ((Py_UCS2*)buf1) + start, end - start,
10909 buf2, len2, PY_SSIZE_T_MAX
10910 );
10911 break;
10912 case PyUnicode_4BYTE_KIND:
10913 iresult = ucs4lib_count(
10914 ((Py_UCS4*)buf1) + start, end - start,
10915 buf2, len2, PY_SSIZE_T_MAX
10916 );
10917 break;
10918 default:
10919 assert(0); iresult = 0;
10920 }
10921
10922 result = PyLong_FromSsize_t(iresult);
10923
10924 if (kind1 != kind)
10925 PyMem_Free(buf1);
10926 if (kind2 != kind)
10927 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010928
10929 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010930
Guido van Rossumd57fd912000-03-10 22:53:23 +000010931 return result;
10932}
10933
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010934PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010935 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010936\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010937Encode S using the codec registered for encoding. Default encoding\n\
10938is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010939handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010940a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10941'xmlcharrefreplace' as well as any other name registered with\n\
10942codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010943
10944static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +000010945unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010946{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010947 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010948 char *encoding = NULL;
10949 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010950
Benjamin Peterson308d6372009-09-18 21:42:35 +000010951 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10952 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010953 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +000010954 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010955}
10956
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010957PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010958 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010959\n\
10960Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010961If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010962
10963static PyObject*
10964unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
10965{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010966 Py_ssize_t i, j, line_pos, src_len, incr;
10967 Py_UCS4 ch;
10968 PyObject *u;
10969 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010970 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010971 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010972 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010973
10974 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010975 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010976
Antoine Pitrou22425222011-10-04 19:10:51 +020010977 if (PyUnicode_READY(self) == -1)
10978 return NULL;
10979
Thomas Wouters7e474022000-07-16 12:04:32 +000010980 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010981 src_len = PyUnicode_GET_LENGTH(self);
10982 i = j = line_pos = 0;
10983 kind = PyUnicode_KIND(self);
10984 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010985 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010986 for (; i < src_len; i++) {
10987 ch = PyUnicode_READ(kind, src_data, i);
10988 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010989 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010990 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010991 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010992 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010993 goto overflow;
10994 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010995 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010996 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010997 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010998 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010999 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011000 goto overflow;
11001 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011002 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011003 if (ch == '\n' || ch == '\r')
11004 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011005 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011006 }
Antoine Pitroue19aa382011-10-04 16:04:01 +020011007 if (!found && PyUnicode_CheckExact(self)) {
11008 Py_INCREF((PyObject *) self);
11009 return (PyObject *) self;
11010 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011011
Guido van Rossumd57fd912000-03-10 22:53:23 +000011012 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011013 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011014 if (!u)
11015 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011016 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011017
Antoine Pitroue71d5742011-10-04 15:55:09 +020011018 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011019
Antoine Pitroue71d5742011-10-04 15:55:09 +020011020 for (; i < src_len; i++) {
11021 ch = PyUnicode_READ(kind, src_data, i);
11022 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011023 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011024 incr = tabsize - (line_pos % tabsize);
11025 line_pos += incr;
11026 while (incr--) {
11027 PyUnicode_WRITE(kind, dest_data, j, ' ');
11028 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011029 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011030 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011031 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011032 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011033 line_pos++;
11034 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011035 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011036 if (ch == '\n' || ch == '\r')
11037 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011038 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011039 }
11040 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinner17efeed2011-10-04 20:05:46 +020011041#ifndef DONT_MAKE_RESULT_READY
11042 if (_PyUnicode_READY_REPLACE(&u)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011043 Py_DECREF(u);
11044 return NULL;
11045 }
Victor Stinner17efeed2011-10-04 20:05:46 +020011046#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011047 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011048 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011049
Antoine Pitroue71d5742011-10-04 15:55:09 +020011050 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011051 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11052 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011053}
11054
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011055PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011056 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011057\n\
11058Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011059such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011060arguments start and end are interpreted as in slice notation.\n\
11061\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011062Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011063
11064static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011065unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011066{
Jesus Ceaac451502011-04-20 17:09:23 +020011067 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011068 Py_ssize_t start;
11069 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011070 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011071
Jesus Ceaac451502011-04-20 17:09:23 +020011072 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11073 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011074 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011075
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011076 if (PyUnicode_READY(self) == -1)
11077 return NULL;
11078 if (PyUnicode_READY(substring) == -1)
11079 return NULL;
11080
Victor Stinner794d5672011-10-10 03:21:36 +020011081 result = any_find_slice(1,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011082 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011083 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011084
11085 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011086
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011087 if (result == -2)
11088 return NULL;
11089
Christian Heimes217cfd12007-12-02 14:31:20 +000011090 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011091}
11092
11093static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011094unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011095{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011096 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
11097 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011098 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011099 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011100}
11101
Guido van Rossumc2504932007-09-18 19:42:40 +000011102/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011103 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011104static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +000011105unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011106{
Guido van Rossumc2504932007-09-18 19:42:40 +000011107 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010011108 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011109
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011110 if (_PyUnicode_HASH(self) != -1)
11111 return _PyUnicode_HASH(self);
11112 if (PyUnicode_READY(self) == -1)
11113 return -1;
11114 len = PyUnicode_GET_LENGTH(self);
11115
11116 /* The hash function as a macro, gets expanded three times below. */
11117#define HASH(P) \
11118 x = (Py_uhash_t)*P << 7; \
11119 while (--len >= 0) \
11120 x = (1000003*x) ^ (Py_uhash_t)*P++;
11121
11122 switch (PyUnicode_KIND(self)) {
11123 case PyUnicode_1BYTE_KIND: {
11124 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11125 HASH(c);
11126 break;
11127 }
11128 case PyUnicode_2BYTE_KIND: {
11129 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11130 HASH(s);
11131 break;
11132 }
11133 default: {
11134 Py_UCS4 *l;
11135 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11136 "Impossible switch case in unicode_hash");
11137 l = PyUnicode_4BYTE_DATA(self);
11138 HASH(l);
11139 break;
11140 }
11141 }
11142 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
11143
Guido van Rossumc2504932007-09-18 19:42:40 +000011144 if (x == -1)
11145 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011146 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011147 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011148}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011149#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011150
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011151PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011152 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011153\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011154Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011155
11156static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011157unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011158{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011159 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +020011160 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011161 Py_ssize_t start;
11162 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011163
Jesus Ceaac451502011-04-20 17:09:23 +020011164 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11165 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011166 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011167
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011168 if (PyUnicode_READY(self) == -1)
11169 return NULL;
11170 if (PyUnicode_READY(substring) == -1)
11171 return NULL;
11172
Victor Stinner794d5672011-10-10 03:21:36 +020011173 result = any_find_slice(1,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011174 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011175 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011176
11177 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011178
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011179 if (result == -2)
11180 return NULL;
11181
Guido van Rossumd57fd912000-03-10 22:53:23 +000011182 if (result < 0) {
11183 PyErr_SetString(PyExc_ValueError, "substring not found");
11184 return NULL;
11185 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011186
Christian Heimes217cfd12007-12-02 14:31:20 +000011187 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011188}
11189
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011190PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011191 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011192\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011193Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011194at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011195
11196static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011197unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011198{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011199 Py_ssize_t i, length;
11200 int kind;
11201 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011202 int cased;
11203
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011204 if (PyUnicode_READY(self) == -1)
11205 return NULL;
11206 length = PyUnicode_GET_LENGTH(self);
11207 kind = PyUnicode_KIND(self);
11208 data = PyUnicode_DATA(self);
11209
Guido van Rossumd57fd912000-03-10 22:53:23 +000011210 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011211 if (length == 1)
11212 return PyBool_FromLong(
11213 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011214
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011215 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011216 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011217 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011218
Guido van Rossumd57fd912000-03-10 22:53:23 +000011219 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011220 for (i = 0; i < length; i++) {
11221 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011222
Benjamin Peterson29060642009-01-31 22:14:21 +000011223 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11224 return PyBool_FromLong(0);
11225 else if (!cased && Py_UNICODE_ISLOWER(ch))
11226 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011227 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011228 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011229}
11230
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011231PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011232 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011233\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011234Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011235at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011236
11237static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011238unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011239{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011240 Py_ssize_t i, length;
11241 int kind;
11242 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011243 int cased;
11244
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011245 if (PyUnicode_READY(self) == -1)
11246 return NULL;
11247 length = PyUnicode_GET_LENGTH(self);
11248 kind = PyUnicode_KIND(self);
11249 data = PyUnicode_DATA(self);
11250
Guido van Rossumd57fd912000-03-10 22:53:23 +000011251 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011252 if (length == 1)
11253 return PyBool_FromLong(
11254 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011255
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011256 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011257 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011258 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011259
Guido van Rossumd57fd912000-03-10 22:53:23 +000011260 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011261 for (i = 0; i < length; i++) {
11262 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011263
Benjamin Peterson29060642009-01-31 22:14:21 +000011264 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11265 return PyBool_FromLong(0);
11266 else if (!cased && Py_UNICODE_ISUPPER(ch))
11267 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011268 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011269 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011270}
11271
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011272PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011273 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011274\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011275Return True if S is a titlecased string and there is at least one\n\
11276character in S, i.e. upper- and titlecase characters may only\n\
11277follow uncased characters and lowercase characters only cased ones.\n\
11278Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011279
11280static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011281unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011282{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011283 Py_ssize_t i, length;
11284 int kind;
11285 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011286 int cased, previous_is_cased;
11287
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011288 if (PyUnicode_READY(self) == -1)
11289 return NULL;
11290 length = PyUnicode_GET_LENGTH(self);
11291 kind = PyUnicode_KIND(self);
11292 data = PyUnicode_DATA(self);
11293
Guido van Rossumd57fd912000-03-10 22:53:23 +000011294 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011295 if (length == 1) {
11296 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11297 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11298 (Py_UNICODE_ISUPPER(ch) != 0));
11299 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011300
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011301 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011302 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011303 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011304
Guido van Rossumd57fd912000-03-10 22:53:23 +000011305 cased = 0;
11306 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011307 for (i = 0; i < length; i++) {
11308 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011309
Benjamin Peterson29060642009-01-31 22:14:21 +000011310 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11311 if (previous_is_cased)
11312 return PyBool_FromLong(0);
11313 previous_is_cased = 1;
11314 cased = 1;
11315 }
11316 else if (Py_UNICODE_ISLOWER(ch)) {
11317 if (!previous_is_cased)
11318 return PyBool_FromLong(0);
11319 previous_is_cased = 1;
11320 cased = 1;
11321 }
11322 else
11323 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011324 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011325 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011326}
11327
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011328PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011329 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011330\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011331Return True if all characters in S are whitespace\n\
11332and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011333
11334static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011335unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011336{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011337 Py_ssize_t i, length;
11338 int kind;
11339 void *data;
11340
11341 if (PyUnicode_READY(self) == -1)
11342 return NULL;
11343 length = PyUnicode_GET_LENGTH(self);
11344 kind = PyUnicode_KIND(self);
11345 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011346
Guido van Rossumd57fd912000-03-10 22:53:23 +000011347 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011348 if (length == 1)
11349 return PyBool_FromLong(
11350 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011351
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011352 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011353 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011354 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011355
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011356 for (i = 0; i < length; i++) {
11357 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011358 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011359 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011360 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011361 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011362}
11363
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011364PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011365 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011366\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011367Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011368and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011369
11370static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011371unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011372{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011373 Py_ssize_t i, length;
11374 int kind;
11375 void *data;
11376
11377 if (PyUnicode_READY(self) == -1)
11378 return NULL;
11379 length = PyUnicode_GET_LENGTH(self);
11380 kind = PyUnicode_KIND(self);
11381 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011382
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011383 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011384 if (length == 1)
11385 return PyBool_FromLong(
11386 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011387
11388 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011389 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011390 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011391
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011392 for (i = 0; i < length; i++) {
11393 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011394 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011395 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011396 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011397}
11398
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011399PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011400 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011401\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011402Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011403and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011404
11405static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011406unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011407{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011408 int kind;
11409 void *data;
11410 Py_ssize_t len, i;
11411
11412 if (PyUnicode_READY(self) == -1)
11413 return NULL;
11414
11415 kind = PyUnicode_KIND(self);
11416 data = PyUnicode_DATA(self);
11417 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011418
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011419 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011420 if (len == 1) {
11421 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11422 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11423 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011424
11425 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011426 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011427 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011428
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011429 for (i = 0; i < len; i++) {
11430 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011431 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011432 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011433 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011434 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011435}
11436
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011437PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011438 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011439\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011440Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011441False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011442
11443static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011444unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011445{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011446 Py_ssize_t i, length;
11447 int kind;
11448 void *data;
11449
11450 if (PyUnicode_READY(self) == -1)
11451 return NULL;
11452 length = PyUnicode_GET_LENGTH(self);
11453 kind = PyUnicode_KIND(self);
11454 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011455
Guido van Rossumd57fd912000-03-10 22:53:23 +000011456 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011457 if (length == 1)
11458 return PyBool_FromLong(
11459 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011460
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011461 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011462 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011463 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011464
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011465 for (i = 0; i < length; i++) {
11466 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011467 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011468 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011469 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011470}
11471
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011472PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011473 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011474\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011475Return True if all characters in S are digits\n\
11476and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011477
11478static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011479unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011480{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011481 Py_ssize_t i, length;
11482 int kind;
11483 void *data;
11484
11485 if (PyUnicode_READY(self) == -1)
11486 return NULL;
11487 length = PyUnicode_GET_LENGTH(self);
11488 kind = PyUnicode_KIND(self);
11489 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011490
Guido van Rossumd57fd912000-03-10 22:53:23 +000011491 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011492 if (length == 1) {
11493 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11494 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11495 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011496
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011497 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011498 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011499 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011500
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011501 for (i = 0; i < length; i++) {
11502 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011503 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011504 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011505 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011506}
11507
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011508PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011509 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011510\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011511Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011512False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011513
11514static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011515unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011516{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011517 Py_ssize_t i, length;
11518 int kind;
11519 void *data;
11520
11521 if (PyUnicode_READY(self) == -1)
11522 return NULL;
11523 length = PyUnicode_GET_LENGTH(self);
11524 kind = PyUnicode_KIND(self);
11525 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011526
Guido van Rossumd57fd912000-03-10 22:53:23 +000011527 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011528 if (length == 1)
11529 return PyBool_FromLong(
11530 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011531
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011532 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011533 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011534 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011535
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011536 for (i = 0; i < length; i++) {
11537 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011538 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011539 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011540 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011541}
11542
Martin v. Löwis47383402007-08-15 07:32:56 +000011543int
11544PyUnicode_IsIdentifier(PyObject *self)
11545{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011546 int kind;
11547 void *data;
11548 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011549 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011550
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011551 if (PyUnicode_READY(self) == -1) {
11552 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011553 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011554 }
11555
11556 /* Special case for empty strings */
11557 if (PyUnicode_GET_LENGTH(self) == 0)
11558 return 0;
11559 kind = PyUnicode_KIND(self);
11560 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011561
11562 /* PEP 3131 says that the first character must be in
11563 XID_Start and subsequent characters in XID_Continue,
11564 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011565 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011566 letters, digits, underscore). However, given the current
11567 definition of XID_Start and XID_Continue, it is sufficient
11568 to check just for these, except that _ must be allowed
11569 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011570 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011571 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011572 return 0;
11573
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011574 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011575 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011576 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011577 return 1;
11578}
11579
11580PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011581 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011582\n\
11583Return True if S is a valid identifier according\n\
11584to the language definition.");
11585
11586static PyObject*
11587unicode_isidentifier(PyObject *self)
11588{
11589 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11590}
11591
Georg Brandl559e5d72008-06-11 18:37:52 +000011592PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011593 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011594\n\
11595Return True if all characters in S are considered\n\
11596printable in repr() or S is empty, False otherwise.");
11597
11598static PyObject*
11599unicode_isprintable(PyObject *self)
11600{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011601 Py_ssize_t i, length;
11602 int kind;
11603 void *data;
11604
11605 if (PyUnicode_READY(self) == -1)
11606 return NULL;
11607 length = PyUnicode_GET_LENGTH(self);
11608 kind = PyUnicode_KIND(self);
11609 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011610
11611 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011612 if (length == 1)
11613 return PyBool_FromLong(
11614 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011615
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011616 for (i = 0; i < length; i++) {
11617 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011618 Py_RETURN_FALSE;
11619 }
11620 }
11621 Py_RETURN_TRUE;
11622}
11623
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011624PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011625 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011626\n\
11627Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011628iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011629
11630static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011631unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011632{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011633 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011634}
11635
Martin v. Löwis18e16552006-02-15 17:27:45 +000011636static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000011637unicode_length(PyUnicodeObject *self)
11638{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011639 if (PyUnicode_READY(self) == -1)
11640 return -1;
11641 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011642}
11643
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011644PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011645 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011646\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011647Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011648done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011649
11650static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011651unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011652{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011653 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011654 Py_UCS4 fillchar = ' ';
11655
11656 if (PyUnicode_READY(self) == -1)
11657 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011658
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011659 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011660 return NULL;
11661
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011662 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011663 Py_INCREF(self);
11664 return (PyObject*) self;
11665 }
11666
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011667 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011668}
11669
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011670PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011671 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011672\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011673Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011674
11675static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011676unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011677{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011678 return fixup(self, fixlower);
11679}
11680
/* Selectors for which end(s) of the string a strip operation trims. */
#define LEFTSTRIP  0
#define RIGHTSTRIP 1
#define BOTHSTRIP  2

/* Argument-parsing format strings, indexed by the selectors above. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for selector i: the format string with its leading "|O:"
   (three characters) skipped. */
#define STRIPNAME(i) (&stripformat[i][3])
11689
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011690/* externally visible for str.strip(unicode) */
11691PyObject *
11692_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
11693{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011694 void *data;
11695 int kind;
11696 Py_ssize_t i, j, len;
11697 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011698
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011699 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11700 return NULL;
11701
11702 kind = PyUnicode_KIND(self);
11703 data = PyUnicode_DATA(self);
11704 len = PyUnicode_GET_LENGTH(self);
11705 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11706 PyUnicode_DATA(sepobj),
11707 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011708
Benjamin Peterson14339b62009-01-31 16:36:08 +000011709 i = 0;
11710 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011711 while (i < len &&
11712 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011713 i++;
11714 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011715 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011716
Benjamin Peterson14339b62009-01-31 16:36:08 +000011717 j = len;
11718 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011719 do {
11720 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011721 } while (j >= i &&
11722 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011723 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011724 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011725
Victor Stinner12bab6d2011-10-01 01:53:49 +020011726 return PyUnicode_Substring((PyObject*)self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011727}
11728
11729PyObject*
11730PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11731{
11732 unsigned char *data;
11733 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011734 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011735
Victor Stinnerde636f32011-10-01 03:55:54 +020011736 if (PyUnicode_READY(self) == -1)
11737 return NULL;
11738
11739 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11740
Victor Stinner12bab6d2011-10-01 01:53:49 +020011741 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011742 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011743 if (PyUnicode_CheckExact(self)) {
11744 Py_INCREF(self);
11745 return self;
11746 }
11747 else
11748 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011749 }
11750
Victor Stinner12bab6d2011-10-01 01:53:49 +020011751 length = end - start;
11752 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011753 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011754
Victor Stinnerde636f32011-10-01 03:55:54 +020011755 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011756 PyErr_SetString(PyExc_IndexError, "string index out of range");
11757 return NULL;
11758 }
11759
Victor Stinnerb9275c12011-10-05 14:01:42 +020011760 if (PyUnicode_IS_ASCII(self)) {
11761 kind = PyUnicode_KIND(self);
11762 data = PyUnicode_1BYTE_DATA(self);
11763 return unicode_fromascii(data + start, length);
11764 }
11765 else {
11766 kind = PyUnicode_KIND(self);
11767 data = PyUnicode_1BYTE_DATA(self);
11768 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011769 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011770 length);
11771 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011772}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011773
11774static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011775do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011776{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011777 int kind;
11778 void *data;
11779 Py_ssize_t len, i, j;
11780
11781 if (PyUnicode_READY(self) == -1)
11782 return NULL;
11783
11784 kind = PyUnicode_KIND(self);
11785 data = PyUnicode_DATA(self);
11786 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011787
Benjamin Peterson14339b62009-01-31 16:36:08 +000011788 i = 0;
11789 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011790 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011791 i++;
11792 }
11793 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011794
Benjamin Peterson14339b62009-01-31 16:36:08 +000011795 j = len;
11796 if (striptype != LEFTSTRIP) {
11797 do {
11798 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011799 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011800 j++;
11801 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011802
Victor Stinner12bab6d2011-10-01 01:53:49 +020011803 return PyUnicode_Substring((PyObject*)self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011804}
11805
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011806
11807static PyObject *
11808do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
11809{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011810 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011811
Benjamin Peterson14339b62009-01-31 16:36:08 +000011812 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11813 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011814
Benjamin Peterson14339b62009-01-31 16:36:08 +000011815 if (sep != NULL && sep != Py_None) {
11816 if (PyUnicode_Check(sep))
11817 return _PyUnicode_XStrip(self, striptype, sep);
11818 else {
11819 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011820 "%s arg must be None or str",
11821 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011822 return NULL;
11823 }
11824 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011825
Benjamin Peterson14339b62009-01-31 16:36:08 +000011826 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011827}
11828
11829
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011830PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011831 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011832\n\
11833Return a copy of the string S with leading and trailing\n\
11834whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011835If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011836
11837static PyObject *
11838unicode_strip(PyUnicodeObject *self, PyObject *args)
11839{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011840 if (PyTuple_GET_SIZE(args) == 0)
11841 return do_strip(self, BOTHSTRIP); /* Common case */
11842 else
11843 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011844}
11845
11846
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011847PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011848 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011849\n\
11850Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011851If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011852
11853static PyObject *
11854unicode_lstrip(PyUnicodeObject *self, PyObject *args)
11855{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011856 if (PyTuple_GET_SIZE(args) == 0)
11857 return do_strip(self, LEFTSTRIP); /* Common case */
11858 else
11859 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011860}
11861
11862
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011863PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011864 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011865\n\
11866Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011867If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011868
11869static PyObject *
11870unicode_rstrip(PyUnicodeObject *self, PyObject *args)
11871{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011872 if (PyTuple_GET_SIZE(args) == 0)
11873 return do_strip(self, RIGHTSTRIP); /* Common case */
11874 else
11875 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011876}
11877
11878
Guido van Rossumd57fd912000-03-10 22:53:23 +000011879static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000011880unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011881{
11882 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011883 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011884
Georg Brandl222de0f2009-04-12 12:01:50 +000011885 if (len < 1) {
11886 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011887 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011888 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011889
Tim Peters7a29bd52001-09-12 03:03:31 +000011890 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011891 /* no repeat, return original string */
11892 Py_INCREF(str);
11893 return (PyObject*) str;
11894 }
Tim Peters8f422462000-09-09 06:13:41 +000011895
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011896 if (PyUnicode_READY(str) == -1)
11897 return NULL;
11898
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011899 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011900 PyErr_SetString(PyExc_OverflowError,
11901 "repeated string is too long");
11902 return NULL;
11903 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011904 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011905
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011906 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011907 if (!u)
11908 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011909 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011910
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011911 if (PyUnicode_GET_LENGTH(str) == 1) {
11912 const int kind = PyUnicode_KIND(str);
11913 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11914 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011915 if (kind == PyUnicode_1BYTE_KIND)
11916 memset(to, (unsigned char)fill_char, len);
11917 else {
11918 for (n = 0; n < len; ++n)
11919 PyUnicode_WRITE(kind, to, n, fill_char);
11920 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011921 }
11922 else {
11923 /* number of characters copied this far */
11924 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011925 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011926 char *to = (char *) PyUnicode_DATA(u);
11927 Py_MEMCPY(to, PyUnicode_DATA(str),
11928 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011929 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011930 n = (done <= nchars-done) ? done : nchars-done;
11931 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011932 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011933 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011934 }
11935
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011936 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011937 return (PyObject*) u;
11938}
11939
Alexander Belopolsky40018472011-02-26 01:02:56 +000011940PyObject *
11941PyUnicode_Replace(PyObject *obj,
11942 PyObject *subobj,
11943 PyObject *replobj,
11944 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011945{
11946 PyObject *self;
11947 PyObject *str1;
11948 PyObject *str2;
11949 PyObject *result;
11950
11951 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011952 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011953 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011954 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011955 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011956 Py_DECREF(self);
11957 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011958 }
11959 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011960 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011961 Py_DECREF(self);
11962 Py_DECREF(str1);
11963 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011964 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011965 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011966 Py_DECREF(self);
11967 Py_DECREF(str1);
11968 Py_DECREF(str2);
11969 return result;
11970}
11971
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011972PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011973 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011974\n\
11975Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011976old replaced by new. If the optional argument count is\n\
11977given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011978
11979static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011980unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011981{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011982 PyObject *str1;
11983 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011984 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011985 PyObject *result;
11986
Martin v. Löwis18e16552006-02-15 17:27:45 +000011987 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011988 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011989 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011990 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011991 str1 = PyUnicode_FromObject(str1);
11992 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11993 return NULL;
11994 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011995 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011996 Py_DECREF(str1);
11997 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011998 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011999
12000 result = replace(self, str1, str2, maxcount);
12001
12002 Py_DECREF(str1);
12003 Py_DECREF(str2);
12004 return result;
12005}
12006
Alexander Belopolsky40018472011-02-26 01:02:56 +000012007static PyObject *
12008unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012009{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012010 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012011 Py_ssize_t isize;
12012 Py_ssize_t osize, squote, dquote, i, o;
12013 Py_UCS4 max, quote;
12014 int ikind, okind;
12015 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012016
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012017 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012018 return NULL;
12019
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012020 isize = PyUnicode_GET_LENGTH(unicode);
12021 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012022
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012023 /* Compute length of output, quote characters, and
12024 maximum character */
12025 osize = 2; /* quotes */
12026 max = 127;
12027 squote = dquote = 0;
12028 ikind = PyUnicode_KIND(unicode);
12029 for (i = 0; i < isize; i++) {
12030 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12031 switch (ch) {
12032 case '\'': squote++; osize++; break;
12033 case '"': dquote++; osize++; break;
12034 case '\\': case '\t': case '\r': case '\n':
12035 osize += 2; break;
12036 default:
12037 /* Fast-path ASCII */
12038 if (ch < ' ' || ch == 0x7f)
12039 osize += 4; /* \xHH */
12040 else if (ch < 0x7f)
12041 osize++;
12042 else if (Py_UNICODE_ISPRINTABLE(ch)) {
12043 osize++;
12044 max = ch > max ? ch : max;
12045 }
12046 else if (ch < 0x100)
12047 osize += 4; /* \xHH */
12048 else if (ch < 0x10000)
12049 osize += 6; /* \uHHHH */
12050 else
12051 osize += 10; /* \uHHHHHHHH */
12052 }
12053 }
12054
12055 quote = '\'';
12056 if (squote) {
12057 if (dquote)
12058 /* Both squote and dquote present. Use squote,
12059 and escape them */
12060 osize += squote;
12061 else
12062 quote = '"';
12063 }
12064
12065 repr = PyUnicode_New(osize, max);
12066 if (repr == NULL)
12067 return NULL;
12068 okind = PyUnicode_KIND(repr);
12069 odata = PyUnicode_DATA(repr);
12070
12071 PyUnicode_WRITE(okind, odata, 0, quote);
12072 PyUnicode_WRITE(okind, odata, osize-1, quote);
12073
12074 for (i = 0, o = 1; i < isize; i++) {
12075 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012076
12077 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012078 if ((ch == quote) || (ch == '\\')) {
12079 PyUnicode_WRITE(okind, odata, o++, '\\');
12080 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012081 continue;
12082 }
12083
Benjamin Peterson29060642009-01-31 22:14:21 +000012084 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012085 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012086 PyUnicode_WRITE(okind, odata, o++, '\\');
12087 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012088 }
12089 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012090 PyUnicode_WRITE(okind, odata, o++, '\\');
12091 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012092 }
12093 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012094 PyUnicode_WRITE(okind, odata, o++, '\\');
12095 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012096 }
12097
12098 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012099 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012100 PyUnicode_WRITE(okind, odata, o++, '\\');
12101 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012102 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12103 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012104 }
12105
Georg Brandl559e5d72008-06-11 18:37:52 +000012106 /* Copy ASCII characters as-is */
12107 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012108 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012109 }
12110
Benjamin Peterson29060642009-01-31 22:14:21 +000012111 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000012112 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012113 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000012114 (categories Z* and C* except ASCII space)
12115 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012116 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012117 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012118 if (ch <= 0xff) {
12119 PyUnicode_WRITE(okind, odata, o++, '\\');
12120 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012121 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12122 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012123 }
12124 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012125 else if (ch >= 0x10000) {
12126 PyUnicode_WRITE(okind, odata, o++, '\\');
12127 PyUnicode_WRITE(okind, odata, o++, 'U');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012128 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12129 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12130 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12131 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12132 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12133 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12134 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12135 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012136 }
12137 /* Map 16-bit characters to '\uxxxx' */
12138 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012139 PyUnicode_WRITE(okind, odata, o++, '\\');
12140 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012141 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12142 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12143 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12144 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012145 }
12146 }
12147 /* Copy characters as-is */
12148 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012149 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012150 }
12151 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012152 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012153 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012154 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012155 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012156}
12157
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012158PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012159 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012160\n\
12161Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012162such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012163arguments start and end are interpreted as in slice notation.\n\
12164\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012165Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012166
12167static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012168unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012169{
Jesus Ceaac451502011-04-20 17:09:23 +020012170 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012171 Py_ssize_t start;
12172 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012173 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012174
Jesus Ceaac451502011-04-20 17:09:23 +020012175 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12176 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012177 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012178
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012179 if (PyUnicode_READY(self) == -1)
12180 return NULL;
12181 if (PyUnicode_READY(substring) == -1)
12182 return NULL;
12183
Victor Stinner794d5672011-10-10 03:21:36 +020012184 result = any_find_slice(-1,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012185 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000012186 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000012187
12188 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012189
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012190 if (result == -2)
12191 return NULL;
12192
Christian Heimes217cfd12007-12-02 14:31:20 +000012193 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012194}
12195
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012196PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012197 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012198\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012199Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012200
12201static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012202unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012203{
Jesus Ceaac451502011-04-20 17:09:23 +020012204 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012205 Py_ssize_t start;
12206 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012207 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012208
Jesus Ceaac451502011-04-20 17:09:23 +020012209 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12210 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012211 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012212
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012213 if (PyUnicode_READY(self) == -1)
12214 return NULL;
12215 if (PyUnicode_READY(substring) == -1)
12216 return NULL;
12217
Victor Stinner794d5672011-10-10 03:21:36 +020012218 result = any_find_slice(-1,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012219 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000012220 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000012221
12222 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012223
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012224 if (result == -2)
12225 return NULL;
12226
Guido van Rossumd57fd912000-03-10 22:53:23 +000012227 if (result < 0) {
12228 PyErr_SetString(PyExc_ValueError, "substring not found");
12229 return NULL;
12230 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012231
Christian Heimes217cfd12007-12-02 14:31:20 +000012232 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012233}
12234
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012235PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012236 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012237\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012238Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012239done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012240
12241static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012242unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012243{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012244 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012245 Py_UCS4 fillchar = ' ';
12246
Victor Stinnere9a29352011-10-01 02:14:59 +020012247 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012248 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012249
Victor Stinnere9a29352011-10-01 02:14:59 +020012250 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012251 return NULL;
12252
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012253 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012254 Py_INCREF(self);
12255 return (PyObject*) self;
12256 }
12257
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012258 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012259}
12260
Alexander Belopolsky40018472011-02-26 01:02:56 +000012261PyObject *
12262PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012263{
12264 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012265
Guido van Rossumd57fd912000-03-10 22:53:23 +000012266 s = PyUnicode_FromObject(s);
12267 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012268 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012269 if (sep != NULL) {
12270 sep = PyUnicode_FromObject(sep);
12271 if (sep == NULL) {
12272 Py_DECREF(s);
12273 return NULL;
12274 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012275 }
12276
Victor Stinner9310abb2011-10-05 00:59:23 +020012277 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012278
12279 Py_DECREF(s);
12280 Py_XDECREF(sep);
12281 return result;
12282}
12283
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012284PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012285 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012286\n\
12287Return a list of the words in S, using sep as the\n\
12288delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012289splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012290whitespace string is a separator and empty strings are\n\
12291removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012292
12293static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012294unicode_split(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012295{
12296 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012297 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012298
Martin v. Löwis18e16552006-02-15 17:27:45 +000012299 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012300 return NULL;
12301
12302 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012303 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012304 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012305 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012306 else
Benjamin Peterson29060642009-01-31 22:14:21 +000012307 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012308}
12309
Thomas Wouters477c8d52006-05-27 19:21:47 +000012310PyObject *
12311PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12312{
12313 PyObject* str_obj;
12314 PyObject* sep_obj;
12315 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012316 int kind1, kind2, kind;
12317 void *buf1 = NULL, *buf2 = NULL;
12318 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012319
12320 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020012321 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012322 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012323 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012324 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012325 Py_DECREF(str_obj);
12326 return NULL;
12327 }
12328
Victor Stinner14f8f022011-10-05 20:58:25 +020012329 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012330 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012331 kind = Py_MAX(kind1, kind2);
12332 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012333 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012334 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012335 if (!buf1)
12336 goto onError;
12337 buf2 = PyUnicode_DATA(sep_obj);
12338 if (kind2 != kind)
12339 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12340 if (!buf2)
12341 goto onError;
12342 len1 = PyUnicode_GET_LENGTH(str_obj);
12343 len2 = PyUnicode_GET_LENGTH(sep_obj);
12344
Victor Stinner14f8f022011-10-05 20:58:25 +020012345 switch(PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012346 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012347 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12348 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12349 else
12350 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012351 break;
12352 case PyUnicode_2BYTE_KIND:
12353 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12354 break;
12355 case PyUnicode_4BYTE_KIND:
12356 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12357 break;
12358 default:
12359 assert(0);
12360 out = 0;
12361 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012362
12363 Py_DECREF(sep_obj);
12364 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012365 if (kind1 != kind)
12366 PyMem_Free(buf1);
12367 if (kind2 != kind)
12368 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012369
12370 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012371 onError:
12372 Py_DECREF(sep_obj);
12373 Py_DECREF(str_obj);
12374 if (kind1 != kind && buf1)
12375 PyMem_Free(buf1);
12376 if (kind2 != kind && buf2)
12377 PyMem_Free(buf2);
12378 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012379}
12380
12381
12382PyObject *
12383PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12384{
12385 PyObject* str_obj;
12386 PyObject* sep_obj;
12387 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012388 int kind1, kind2, kind;
12389 void *buf1 = NULL, *buf2 = NULL;
12390 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012391
12392 str_obj = PyUnicode_FromObject(str_in);
12393 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012394 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012395 sep_obj = PyUnicode_FromObject(sep_in);
12396 if (!sep_obj) {
12397 Py_DECREF(str_obj);
12398 return NULL;
12399 }
12400
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012401 kind1 = PyUnicode_KIND(str_in);
12402 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012403 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012404 buf1 = PyUnicode_DATA(str_in);
12405 if (kind1 != kind)
12406 buf1 = _PyUnicode_AsKind(str_in, kind);
12407 if (!buf1)
12408 goto onError;
12409 buf2 = PyUnicode_DATA(sep_obj);
12410 if (kind2 != kind)
12411 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12412 if (!buf2)
12413 goto onError;
12414 len1 = PyUnicode_GET_LENGTH(str_obj);
12415 len2 = PyUnicode_GET_LENGTH(sep_obj);
12416
12417 switch(PyUnicode_KIND(str_in)) {
12418 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012419 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12420 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12421 else
12422 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012423 break;
12424 case PyUnicode_2BYTE_KIND:
12425 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12426 break;
12427 case PyUnicode_4BYTE_KIND:
12428 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12429 break;
12430 default:
12431 assert(0);
12432 out = 0;
12433 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012434
12435 Py_DECREF(sep_obj);
12436 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012437 if (kind1 != kind)
12438 PyMem_Free(buf1);
12439 if (kind2 != kind)
12440 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012441
12442 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012443 onError:
12444 Py_DECREF(sep_obj);
12445 Py_DECREF(str_obj);
12446 if (kind1 != kind && buf1)
12447 PyMem_Free(buf1);
12448 if (kind2 != kind && buf2)
12449 PyMem_Free(buf2);
12450 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012451}
12452
12453PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012454 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012455\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012456Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012457the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012458found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012459
12460static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012461unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012462{
Victor Stinner9310abb2011-10-05 00:59:23 +020012463 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012464}
12465
12466PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012467 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012468\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012469Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012470the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012471separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012472
12473static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012474unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012475{
Victor Stinner9310abb2011-10-05 00:59:23 +020012476 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012477}
12478
Alexander Belopolsky40018472011-02-26 01:02:56 +000012479PyObject *
12480PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012481{
12482 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012483
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012484 s = PyUnicode_FromObject(s);
12485 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012486 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012487 if (sep != NULL) {
12488 sep = PyUnicode_FromObject(sep);
12489 if (sep == NULL) {
12490 Py_DECREF(s);
12491 return NULL;
12492 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012493 }
12494
Victor Stinner9310abb2011-10-05 00:59:23 +020012495 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012496
12497 Py_DECREF(s);
12498 Py_XDECREF(sep);
12499 return result;
12500}
12501
12502PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012503 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012504\n\
12505Return a list of the words in S, using sep as the\n\
12506delimiter string, starting at the end of the string and\n\
12507working to the front. If maxsplit is given, at most maxsplit\n\
12508splits are done. If sep is not specified, any whitespace string\n\
12509is a separator.");
12510
12511static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012512unicode_rsplit(PyObject *self, PyObject *args)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012513{
12514 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012515 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012516
Martin v. Löwis18e16552006-02-15 17:27:45 +000012517 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012518 return NULL;
12519
12520 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012521 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012522 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012523 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012524 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012525 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012526}
12527
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012528PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012529 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012530\n\
12531Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012532Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012533is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012534
12535static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012536unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012537{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012538 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012539 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012540
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012541 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12542 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012543 return NULL;
12544
Guido van Rossum86662912000-04-11 15:38:46 +000012545 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012546}
12547
12548static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012549PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012550{
Walter Dörwald346737f2007-05-31 10:44:43 +000012551 if (PyUnicode_CheckExact(self)) {
12552 Py_INCREF(self);
12553 return self;
12554 } else
12555 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020012556 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012557}
12558
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012559PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012560 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012561\n\
12562Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012563and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012564
12565static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012566unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012567{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012568 return fixup(self, fixswapcase);
12569}
12570
Georg Brandlceee0772007-11-27 23:48:05 +000012571PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012572 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012573\n\
12574Return a translation table usable for str.translate().\n\
12575If there is only one argument, it must be a dictionary mapping Unicode\n\
12576ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012577Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012578If there are two arguments, they must be strings of equal length, and\n\
12579in the resulting dictionary, each character in x will be mapped to the\n\
12580character at the same position in y. If there is a third argument, it\n\
12581must be a string, whose characters will be mapped to None in the result.");
12582
12583static PyObject*
12584unicode_maketrans(PyUnicodeObject *null, PyObject *args)
12585{
12586 PyObject *x, *y = NULL, *z = NULL;
12587 PyObject *new = NULL, *key, *value;
12588 Py_ssize_t i = 0;
12589 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012590
Georg Brandlceee0772007-11-27 23:48:05 +000012591 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12592 return NULL;
12593 new = PyDict_New();
12594 if (!new)
12595 return NULL;
12596 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012597 int x_kind, y_kind, z_kind;
12598 void *x_data, *y_data, *z_data;
12599
Georg Brandlceee0772007-11-27 23:48:05 +000012600 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012601 if (!PyUnicode_Check(x)) {
12602 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12603 "be a string if there is a second argument");
12604 goto err;
12605 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012606 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012607 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12608 "arguments must have equal length");
12609 goto err;
12610 }
12611 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012612 x_kind = PyUnicode_KIND(x);
12613 y_kind = PyUnicode_KIND(y);
12614 x_data = PyUnicode_DATA(x);
12615 y_data = PyUnicode_DATA(y);
12616 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12617 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12618 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012619 if (!key || !value)
12620 goto err;
12621 res = PyDict_SetItem(new, key, value);
12622 Py_DECREF(key);
12623 Py_DECREF(value);
12624 if (res < 0)
12625 goto err;
12626 }
12627 /* create entries for deleting chars in z */
12628 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012629 z_kind = PyUnicode_KIND(z);
12630 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012631 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012632 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012633 if (!key)
12634 goto err;
12635 res = PyDict_SetItem(new, key, Py_None);
12636 Py_DECREF(key);
12637 if (res < 0)
12638 goto err;
12639 }
12640 }
12641 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012642 int kind;
12643 void *data;
12644
Georg Brandlceee0772007-11-27 23:48:05 +000012645 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012646 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012647 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12648 "to maketrans it must be a dict");
12649 goto err;
12650 }
12651 /* copy entries into the new dict, converting string keys to int keys */
12652 while (PyDict_Next(x, &i, &key, &value)) {
12653 if (PyUnicode_Check(key)) {
12654 /* convert string keys to integer keys */
12655 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012656 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012657 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12658 "table must be of length 1");
12659 goto err;
12660 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012661 kind = PyUnicode_KIND(key);
12662 data = PyUnicode_DATA(key);
12663 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012664 if (!newkey)
12665 goto err;
12666 res = PyDict_SetItem(new, newkey, value);
12667 Py_DECREF(newkey);
12668 if (res < 0)
12669 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012670 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012671 /* just keep integer keys */
12672 if (PyDict_SetItem(new, key, value) < 0)
12673 goto err;
12674 } else {
12675 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12676 "be strings or integers");
12677 goto err;
12678 }
12679 }
12680 }
12681 return new;
12682 err:
12683 Py_DECREF(new);
12684 return NULL;
12685}
12686
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012687PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012688 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012689\n\
12690Return a copy of the string S, where all characters have been mapped\n\
12691through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012692Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012693Unmapped characters are left untouched. Characters mapped to None\n\
12694are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012695
12696static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012697unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012698{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012699 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012700}
12701
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012702PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012703 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012704\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012705Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012706
12707static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012708unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012709{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012710 return fixup(self, fixupper);
12711}
12712
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012713PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012714 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012715\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012716Pad a numeric string S with zeros on the left, to fill a field\n\
12717of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012718
12719static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012720unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012721{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012722 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012723 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012724 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012725 int kind;
12726 void *data;
12727 Py_UCS4 chr;
12728
12729 if (PyUnicode_READY(self) == -1)
12730 return NULL;
12731
Martin v. Löwis18e16552006-02-15 17:27:45 +000012732 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012733 return NULL;
12734
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012735 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012736 if (PyUnicode_CheckExact(self)) {
12737 Py_INCREF(self);
12738 return (PyObject*) self;
12739 }
12740 else
Victor Stinner2219e0a2011-10-01 01:16:59 +020012741 return PyUnicode_Copy((PyObject*)self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012742 }
12743
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012744 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012745
12746 u = pad(self, fill, 0, '0');
12747
Walter Dörwald068325e2002-04-15 13:36:47 +000012748 if (u == NULL)
12749 return NULL;
12750
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012751 kind = PyUnicode_KIND(u);
12752 data = PyUnicode_DATA(u);
12753 chr = PyUnicode_READ(kind, data, fill);
12754
12755 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012756 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012757 PyUnicode_WRITE(kind, data, 0, chr);
12758 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012759 }
12760
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012761 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012762 return (PyObject*) u;
12763}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012764
#if 0
/* Disabled at compile time: thin wrapper that would forward to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
12772
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012773PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012774 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012775\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012776Return True if S starts with the specified prefix, False otherwise.\n\
12777With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012778With optional end, stop comparing S at that position.\n\
12779prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012780
12781static PyObject *
12782unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012783 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012784{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012785 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012786 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012787 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012788 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012789 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012790
Jesus Ceaac451502011-04-20 17:09:23 +020012791 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012792 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012793 if (PyTuple_Check(subobj)) {
12794 Py_ssize_t i;
12795 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12796 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012797 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012798 if (substring == NULL)
12799 return NULL;
12800 result = tailmatch(self, substring, start, end, -1);
12801 Py_DECREF(substring);
12802 if (result) {
12803 Py_RETURN_TRUE;
12804 }
12805 }
12806 /* nothing matched */
12807 Py_RETURN_FALSE;
12808 }
12809 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012810 if (substring == NULL) {
12811 if (PyErr_ExceptionMatches(PyExc_TypeError))
12812 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12813 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012814 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012815 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012816 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012817 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012818 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012819}
12820
12821
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012822PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012823 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012824\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012825Return True if S ends with the specified suffix, False otherwise.\n\
12826With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012827With optional end, stop comparing S at that position.\n\
12828suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012829
12830static PyObject *
12831unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012832 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012833{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012834 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012835 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012836 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012837 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012838 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012839
Jesus Ceaac451502011-04-20 17:09:23 +020012840 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012841 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012842 if (PyTuple_Check(subobj)) {
12843 Py_ssize_t i;
12844 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12845 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012846 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012847 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012848 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012849 result = tailmatch(self, substring, start, end, +1);
12850 Py_DECREF(substring);
12851 if (result) {
12852 Py_RETURN_TRUE;
12853 }
12854 }
12855 Py_RETURN_FALSE;
12856 }
12857 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012858 if (substring == NULL) {
12859 if (PyErr_ExceptionMatches(PyExc_TypeError))
12860 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12861 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012862 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012863 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012864 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012865 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012866 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012867}
12868
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012869#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012870
12871PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012872 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012873\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012874Return a formatted version of S, using substitutions from args and kwargs.\n\
12875The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012876
Eric Smith27bbca62010-11-04 17:06:58 +000012877PyDoc_STRVAR(format_map__doc__,
12878 "S.format_map(mapping) -> str\n\
12879\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012880Return a formatted version of S, using substitutions from mapping.\n\
12881The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012882
Eric Smith4a7d76d2008-05-30 18:10:19 +000012883static PyObject *
12884unicode__format__(PyObject* self, PyObject* args)
12885{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012886 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012887
12888 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12889 return NULL;
12890
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012891 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012892 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012893 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012894}
12895
Eric Smith8c663262007-08-25 02:26:07 +000012896PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012897 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012898\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012899Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012900
12901static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012902unicode__sizeof__(PyUnicodeObject *v)
12903{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012904 Py_ssize_t size;
12905
12906 /* If it's a compact object, account for base structure +
12907 character data. */
12908 if (PyUnicode_IS_COMPACT_ASCII(v))
12909 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12910 else if (PyUnicode_IS_COMPACT(v))
12911 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012912 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012913 else {
12914 /* If it is a two-block object, account for base object, and
12915 for character block if present. */
12916 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012917 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012918 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012919 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012920 }
12921 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012922 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012923 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012924 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012925 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012926 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012927
12928 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012929}
12930
12931PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012932 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012933
12934static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012935unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012936{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012937 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012938 if (!copy)
12939 return NULL;
12940 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012941}
12942
Guido van Rossumd57fd912000-03-10 22:53:23 +000012943static PyMethodDef unicode_methods[] = {
12944
12945 /* Order is according to common usage: often used methods should
12946 appear first, since lookup is done sequentially. */
12947
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012948 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012949 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12950 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012951 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012952 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12953 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12954 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12955 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12956 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12957 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12958 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012959 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012960 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12961 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12962 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012963 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012964 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12965 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12966 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012967 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012968 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012969 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012970 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012971 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12972 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12973 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12974 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12975 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12976 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12977 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12978 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12979 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12980 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12981 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12982 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12983 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12984 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012985 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012986 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012987 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012988 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012989 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012990 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012991 {"maketrans", (PyCFunction) unicode_maketrans,
12992 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012993 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012994#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012995 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012996#endif
12997
12998#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012999 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013000 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013001#endif
13002
Benjamin Peterson14339b62009-01-31 16:36:08 +000013003 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013004 {NULL, NULL}
13005};
13006
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013007static PyObject *
13008unicode_mod(PyObject *v, PyObject *w)
13009{
Brian Curtindfc80e32011-08-10 20:28:54 -050013010 if (!PyUnicode_Check(v))
13011 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013012 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013013}
13014
13015static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013016 0, /*nb_add*/
13017 0, /*nb_subtract*/
13018 0, /*nb_multiply*/
13019 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013020};
13021
Guido van Rossumd57fd912000-03-10 22:53:23 +000013022static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013023 (lenfunc) unicode_length, /* sq_length */
13024 PyUnicode_Concat, /* sq_concat */
13025 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13026 (ssizeargfunc) unicode_getitem, /* sq_item */
13027 0, /* sq_slice */
13028 0, /* sq_ass_item */
13029 0, /* sq_ass_slice */
13030 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013031};
13032
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013033static PyObject*
13034unicode_subscript(PyUnicodeObject* self, PyObject* item)
13035{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013036 if (PyUnicode_READY(self) == -1)
13037 return NULL;
13038
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013039 if (PyIndex_Check(item)) {
13040 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013041 if (i == -1 && PyErr_Occurred())
13042 return NULL;
13043 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013044 i += PyUnicode_GET_LENGTH(self);
Victor Stinner2fe5ced2011-10-02 00:25:40 +020013045 return unicode_getitem((PyObject*)self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013046 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013047 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013048 PyObject *result;
13049 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013050 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013051 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013052
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013053 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013054 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013055 return NULL;
13056 }
13057
13058 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013059 return PyUnicode_New(0, 0);
13060 } else if (start == 0 && step == 1 &&
13061 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000013062 PyUnicode_CheckExact(self)) {
13063 Py_INCREF(self);
13064 return (PyObject *)self;
13065 } else if (step == 1) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020013066 return PyUnicode_Substring((PyObject*)self,
13067 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013068 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013069 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013070 src_kind = PyUnicode_KIND(self);
13071 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013072 if (!PyUnicode_IS_ASCII(self)) {
13073 kind_limit = kind_maxchar_limit(src_kind);
13074 max_char = 0;
13075 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13076 ch = PyUnicode_READ(src_kind, src_data, cur);
13077 if (ch > max_char) {
13078 max_char = ch;
13079 if (max_char >= kind_limit)
13080 break;
13081 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013082 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013083 }
Victor Stinner55c99112011-10-13 01:17:06 +020013084 else
13085 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013086 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013087 if (result == NULL)
13088 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013089 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013090 dest_data = PyUnicode_DATA(result);
13091
13092 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013093 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13094 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013095 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013096 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013097 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013098 } else {
13099 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13100 return NULL;
13101 }
13102}
13103
13104static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013105 (lenfunc)unicode_length, /* mp_length */
13106 (binaryfunc)unicode_subscript, /* mp_subscript */
13107 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013108};
13109
Guido van Rossumd57fd912000-03-10 22:53:23 +000013110
Guido van Rossumd57fd912000-03-10 22:53:23 +000013111/* Helpers for PyUnicode_Format() */
13112
13113static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000013114getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013115{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013116 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013117 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013118 (*p_argidx)++;
13119 if (arglen < 0)
13120 return args;
13121 else
13122 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013123 }
13124 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013125 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013126 return NULL;
13127}
13128
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013129/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013130
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013131static PyObject *
13132formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013133{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013134 char *p;
13135 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013136 double x;
Tim Petersced69f82003-09-16 20:30:58 +000013137
Guido van Rossumd57fd912000-03-10 22:53:23 +000013138 x = PyFloat_AsDouble(v);
13139 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013140 return NULL;
13141
Guido van Rossumd57fd912000-03-10 22:53:23 +000013142 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013143 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013144
Eric Smith0923d1d2009-04-16 20:16:10 +000013145 p = PyOS_double_to_string(x, type, prec,
13146 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013147 if (p == NULL)
13148 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013149 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000013150 PyMem_Free(p);
13151 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013152}
13153
Tim Peters38fd5b62000-09-21 05:43:11 +000013154static PyObject*
13155formatlong(PyObject *val, int flags, int prec, int type)
13156{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013157 char *buf;
13158 int len;
13159 PyObject *str; /* temporary string object. */
13160 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013161
Benjamin Peterson14339b62009-01-31 16:36:08 +000013162 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
13163 if (!str)
13164 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013165 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013166 Py_DECREF(str);
13167 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013168}
13169
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013170static Py_UCS4
13171formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013172{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013173 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013174 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013175 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013176 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013177 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013178 goto onError;
13179 }
13180 else {
13181 /* Integer input truncated to a character */
13182 long x;
13183 x = PyLong_AsLong(v);
13184 if (x == -1 && PyErr_Occurred())
13185 goto onError;
13186
13187 if (x < 0 || x > 0x10ffff) {
13188 PyErr_SetString(PyExc_OverflowError,
13189 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013190 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013191 }
13192
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013193 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013194 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013195
Benjamin Peterson29060642009-01-31 22:14:21 +000013196 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013197 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013198 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013199 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013200}
13201
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013202static int
13203repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
13204{
13205 int r;
13206 assert(count > 0);
13207 assert(PyUnicode_Check(obj));
13208 if (count > 5) {
13209 PyObject *repeated = unicode_repeat((PyUnicodeObject *) obj, count);
13210 if (repeated == NULL)
13211 return -1;
13212 r = _PyAccu_Accumulate(acc, repeated);
13213 Py_DECREF(repeated);
13214 return r;
13215 }
13216 else {
13217 do {
13218 if (_PyAccu_Accumulate(acc, obj))
13219 return -1;
13220 } while (--count);
13221 return 0;
13222 }
13223}
13224
Alexander Belopolsky40018472011-02-26 01:02:56 +000013225PyObject *
13226PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013227{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013228 void *fmt;
13229 int fmtkind;
13230 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013231 int kind;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013232 int r;
13233 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013234 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013235 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013236 PyObject *temp = NULL;
13237 PyObject *second = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013238 PyUnicodeObject *uformat;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013239 _PyAccu acc;
13240 static PyObject *plus, *minus, *blank, *zero, *percent;
13241
13242 if (!plus && !(plus = get_latin1_char('+')))
13243 return NULL;
13244 if (!minus && !(minus = get_latin1_char('-')))
13245 return NULL;
13246 if (!blank && !(blank = get_latin1_char(' ')))
13247 return NULL;
13248 if (!zero && !(zero = get_latin1_char('0')))
13249 return NULL;
13250 if (!percent && !(percent = get_latin1_char('%')))
13251 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000013252
Guido van Rossumd57fd912000-03-10 22:53:23 +000013253 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013254 PyErr_BadInternalCall();
13255 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013256 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013257 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
13258 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013259 return NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013260 if (_PyAccu_Init(&acc))
13261 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013262 fmt = PyUnicode_DATA(uformat);
13263 fmtkind = PyUnicode_KIND(uformat);
13264 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13265 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013266
Guido van Rossumd57fd912000-03-10 22:53:23 +000013267 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013268 arglen = PyTuple_Size(args);
13269 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013270 }
13271 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013272 arglen = -1;
13273 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013274 }
Christian Heimes90aa7642007-12-19 02:45:37 +000013275 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000013276 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013277 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013278
13279 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013280 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013281 PyObject *nonfmt;
13282 Py_ssize_t nonfmtpos;
13283 nonfmtpos = fmtpos++;
13284 while (fmtcnt >= 0 &&
13285 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13286 fmtpos++;
13287 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013288 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013289 nonfmt = PyUnicode_Substring((PyObject *) uformat, nonfmtpos, fmtpos);
13290 if (nonfmt == NULL)
13291 goto onError;
13292 r = _PyAccu_Accumulate(&acc, nonfmt);
13293 Py_DECREF(nonfmt);
13294 if (r)
13295 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013296 }
13297 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013298 /* Got a format specifier */
13299 int flags = 0;
13300 Py_ssize_t width = -1;
13301 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013302 Py_UCS4 c = '\0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013303 Py_UCS4 fill, sign;
Benjamin Peterson29060642009-01-31 22:14:21 +000013304 int isnumok;
13305 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013306 void *pbuf = NULL;
13307 Py_ssize_t pindex, len;
13308 PyObject *signobj = NULL, *fillobj = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013309
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013310 fmtpos++;
13311 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
13312 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013313 Py_ssize_t keylen;
13314 PyObject *key;
13315 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013316
Benjamin Peterson29060642009-01-31 22:14:21 +000013317 if (dict == NULL) {
13318 PyErr_SetString(PyExc_TypeError,
13319 "format requires a mapping");
13320 goto onError;
13321 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013322 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013323 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013324 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013325 /* Skip over balanced parentheses */
13326 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013327 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013328 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013329 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013330 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013331 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013332 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013333 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013334 if (fmtcnt < 0 || pcount > 0) {
13335 PyErr_SetString(PyExc_ValueError,
13336 "incomplete format key");
13337 goto onError;
13338 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020013339 key = PyUnicode_Substring((PyObject*)uformat,
13340 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013341 if (key == NULL)
13342 goto onError;
13343 if (args_owned) {
13344 Py_DECREF(args);
13345 args_owned = 0;
13346 }
13347 args = PyObject_GetItem(dict, key);
13348 Py_DECREF(key);
13349 if (args == NULL) {
13350 goto onError;
13351 }
13352 args_owned = 1;
13353 arglen = -1;
13354 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013355 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013356 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013357 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013358 case '-': flags |= F_LJUST; continue;
13359 case '+': flags |= F_SIGN; continue;
13360 case ' ': flags |= F_BLANK; continue;
13361 case '#': flags |= F_ALT; continue;
13362 case '0': flags |= F_ZERO; continue;
13363 }
13364 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013365 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013366 if (c == '*') {
13367 v = getnextarg(args, arglen, &argidx);
13368 if (v == NULL)
13369 goto onError;
13370 if (!PyLong_Check(v)) {
13371 PyErr_SetString(PyExc_TypeError,
13372 "* wants int");
13373 goto onError;
13374 }
13375 width = PyLong_AsLong(v);
13376 if (width == -1 && PyErr_Occurred())
13377 goto onError;
13378 if (width < 0) {
13379 flags |= F_LJUST;
13380 width = -width;
13381 }
13382 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013383 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013384 }
13385 else if (c >= '0' && c <= '9') {
13386 width = c - '0';
13387 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013388 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013389 if (c < '0' || c > '9')
13390 break;
13391 if ((width*10) / 10 != width) {
13392 PyErr_SetString(PyExc_ValueError,
13393 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013394 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013395 }
13396 width = width*10 + (c - '0');
13397 }
13398 }
13399 if (c == '.') {
13400 prec = 0;
13401 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013402 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013403 if (c == '*') {
13404 v = getnextarg(args, arglen, &argidx);
13405 if (v == NULL)
13406 goto onError;
13407 if (!PyLong_Check(v)) {
13408 PyErr_SetString(PyExc_TypeError,
13409 "* wants int");
13410 goto onError;
13411 }
13412 prec = PyLong_AsLong(v);
13413 if (prec == -1 && PyErr_Occurred())
13414 goto onError;
13415 if (prec < 0)
13416 prec = 0;
13417 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013418 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013419 }
13420 else if (c >= '0' && c <= '9') {
13421 prec = c - '0';
13422 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013423 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013424 if (c < '0' || c > '9')
13425 break;
13426 if ((prec*10) / 10 != prec) {
13427 PyErr_SetString(PyExc_ValueError,
13428 "prec too big");
13429 goto onError;
13430 }
13431 prec = prec*10 + (c - '0');
13432 }
13433 }
13434 } /* prec */
13435 if (fmtcnt >= 0) {
13436 if (c == 'h' || c == 'l' || c == 'L') {
13437 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013438 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013439 }
13440 }
13441 if (fmtcnt < 0) {
13442 PyErr_SetString(PyExc_ValueError,
13443 "incomplete format");
13444 goto onError;
13445 }
13446 if (c != '%') {
13447 v = getnextarg(args, arglen, &argidx);
13448 if (v == NULL)
13449 goto onError;
13450 }
13451 sign = 0;
13452 fill = ' ';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013453 fillobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013454 switch (c) {
13455
13456 case '%':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013457 _PyAccu_Accumulate(&acc, percent);
13458 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013459
13460 case 's':
13461 case 'r':
13462 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000013463 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013464 temp = v;
13465 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013466 }
13467 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013468 if (c == 's')
13469 temp = PyObject_Str(v);
13470 else if (c == 'r')
13471 temp = PyObject_Repr(v);
13472 else
13473 temp = PyObject_ASCII(v);
13474 if (temp == NULL)
13475 goto onError;
13476 if (PyUnicode_Check(temp))
13477 /* nothing to do */;
13478 else {
13479 Py_DECREF(temp);
13480 PyErr_SetString(PyExc_TypeError,
13481 "%s argument has non-string str()");
13482 goto onError;
13483 }
13484 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013485 if (PyUnicode_READY(temp) == -1) {
13486 Py_CLEAR(temp);
13487 goto onError;
13488 }
13489 pbuf = PyUnicode_DATA(temp);
13490 kind = PyUnicode_KIND(temp);
13491 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013492 if (prec >= 0 && len > prec)
13493 len = prec;
13494 break;
13495
13496 case 'i':
13497 case 'd':
13498 case 'u':
13499 case 'o':
13500 case 'x':
13501 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000013502 isnumok = 0;
13503 if (PyNumber_Check(v)) {
13504 PyObject *iobj=NULL;
13505
13506 if (PyLong_Check(v)) {
13507 iobj = v;
13508 Py_INCREF(iobj);
13509 }
13510 else {
13511 iobj = PyNumber_Long(v);
13512 }
13513 if (iobj!=NULL) {
13514 if (PyLong_Check(iobj)) {
13515 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013516 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013517 Py_DECREF(iobj);
13518 if (!temp)
13519 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013520 if (PyUnicode_READY(temp) == -1) {
13521 Py_CLEAR(temp);
13522 goto onError;
13523 }
13524 pbuf = PyUnicode_DATA(temp);
13525 kind = PyUnicode_KIND(temp);
13526 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013527 sign = 1;
13528 }
13529 else {
13530 Py_DECREF(iobj);
13531 }
13532 }
13533 }
13534 if (!isnumok) {
13535 PyErr_Format(PyExc_TypeError,
13536 "%%%c format: a number is required, "
13537 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13538 goto onError;
13539 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013540 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013541 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013542 fillobj = zero;
13543 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013544 break;
13545
13546 case 'e':
13547 case 'E':
13548 case 'f':
13549 case 'F':
13550 case 'g':
13551 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013552 temp = formatfloat(v, flags, prec, c);
13553 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000013554 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013555 if (PyUnicode_READY(temp) == -1) {
13556 Py_CLEAR(temp);
13557 goto onError;
13558 }
13559 pbuf = PyUnicode_DATA(temp);
13560 kind = PyUnicode_KIND(temp);
13561 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013562 sign = 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013563 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013564 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013565 fillobj = zero;
13566 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013567 break;
13568
13569 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013570 {
13571 Py_UCS4 ch = formatchar(v);
13572 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013573 goto onError;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013574 temp = _PyUnicode_FromUCS4(&ch, 1);
13575 if (temp == NULL)
13576 goto onError;
13577 pbuf = PyUnicode_DATA(temp);
13578 kind = PyUnicode_KIND(temp);
13579 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013580 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013581 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013582
13583 default:
13584 PyErr_Format(PyExc_ValueError,
13585 "unsupported format character '%c' (0x%x) "
13586 "at index %zd",
13587 (31<=c && c<=126) ? (char)c : '?',
13588 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013589 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013590 goto onError;
13591 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013592 /* pbuf is initialized here. */
13593 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013594 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013595 if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
13596 signobj = minus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013597 len--;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013598 pindex++;
13599 }
13600 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
13601 signobj = plus;
13602 len--;
13603 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013604 }
13605 else if (flags & F_SIGN)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013606 signobj = plus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013607 else if (flags & F_BLANK)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013608 signobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013609 else
13610 sign = 0;
13611 }
13612 if (width < len)
13613 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013614 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013615 if (fill != ' ') {
13616 assert(signobj != NULL);
13617 if (_PyAccu_Accumulate(&acc, signobj))
13618 goto onError;
13619 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013620 if (width > len)
13621 width--;
13622 }
13623 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013624 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013625 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013626 if (fill != ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013627 second = get_latin1_char(
13628 PyUnicode_READ(kind, pbuf, pindex + 1));
13629 pindex += 2;
13630 if (second == NULL ||
13631 _PyAccu_Accumulate(&acc, zero) ||
13632 _PyAccu_Accumulate(&acc, second))
13633 goto onError;
13634 Py_CLEAR(second);
Benjamin Peterson29060642009-01-31 22:14:21 +000013635 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013636 width -= 2;
13637 if (width < 0)
13638 width = 0;
13639 len -= 2;
13640 }
13641 if (width > len && !(flags & F_LJUST)) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013642 assert(fillobj != NULL);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013643 if (repeat_accumulate(&acc, fillobj, width - len))
13644 goto onError;
13645 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013646 }
13647 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013648 if (sign) {
13649 assert(signobj != NULL);
13650 if (_PyAccu_Accumulate(&acc, signobj))
13651 goto onError;
13652 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013653 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013654 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13655 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013656 second = get_latin1_char(
13657 PyUnicode_READ(kind, pbuf, pindex + 1));
13658 pindex += 2;
13659 if (second == NULL ||
13660 _PyAccu_Accumulate(&acc, zero) ||
13661 _PyAccu_Accumulate(&acc, second))
13662 goto onError;
13663 Py_CLEAR(second);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013664 }
13665 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013666 /* Copy all characters, preserving len */
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013667 if (temp != NULL) {
13668 assert(pbuf == PyUnicode_DATA(temp));
13669 v = PyUnicode_Substring(temp, pindex, pindex + len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013670 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013671 else {
13672 const char *p = (const char *) pbuf;
13673 assert(pbuf != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013674 p += kind * pindex;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013675 v = PyUnicode_FromKindAndData(kind, p, len);
13676 }
13677 if (v == NULL)
13678 goto onError;
13679 r = _PyAccu_Accumulate(&acc, v);
13680 Py_DECREF(v);
13681 if (r)
13682 goto onError;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013683 if (width > len && repeat_accumulate(&acc, blank, width - len))
13684 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013685 if (dict && (argidx < arglen) && c != '%') {
13686 PyErr_SetString(PyExc_TypeError,
13687 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013688 goto onError;
13689 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013690 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013691 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013692 } /* until end */
13693 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013694 PyErr_SetString(PyExc_TypeError,
13695 "not all arguments converted during string formatting");
13696 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013697 }
13698
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013699 result = _PyAccu_Finish(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013700 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013701 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013702 }
13703 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013704 Py_XDECREF(temp);
13705 Py_XDECREF(second);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013706 return (PyObject *)result;
13707
Benjamin Peterson29060642009-01-31 22:14:21 +000013708 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013709 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013710 Py_XDECREF(temp);
13711 Py_XDECREF(second);
13712 _PyAccu_Destroy(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013713 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013714 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013715 }
13716 return NULL;
13717}
13718
Jeremy Hylton938ace62002-07-17 16:30:39 +000013719static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013720unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13721
Tim Peters6d6c1a32001-08-02 04:15:00 +000013722static PyObject *
13723unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13724{
Benjamin Peterson29060642009-01-31 22:14:21 +000013725 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013726 static char *kwlist[] = {"object", "encoding", "errors", 0};
13727 char *encoding = NULL;
13728 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013729
Benjamin Peterson14339b62009-01-31 16:36:08 +000013730 if (type != &PyUnicode_Type)
13731 return unicode_subtype_new(type, args, kwds);
13732 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013733 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013734 return NULL;
13735 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013736 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013737 if (encoding == NULL && errors == NULL)
13738 return PyObject_Str(x);
13739 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013740 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013741}
13742
Guido van Rossume023fe02001-08-30 03:12:59 +000013743static PyObject *
13744unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13745{
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013746 PyUnicodeObject *unicode, *self;
13747 Py_ssize_t length, char_size;
13748 int share_wstr, share_utf8;
13749 unsigned int kind;
13750 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013751
Benjamin Peterson14339b62009-01-31 16:36:08 +000013752 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013753
13754 unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
13755 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013756 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013757 assert(_PyUnicode_CHECK(unicode));
Victor Stinnere06e1452011-10-04 20:52:31 +020013758 if (PyUnicode_READY(unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013759 return NULL;
13760
13761 self = (PyUnicodeObject *) type->tp_alloc(type, 0);
13762 if (self == NULL) {
13763 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013764 return NULL;
13765 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013766 kind = PyUnicode_KIND(unicode);
13767 length = PyUnicode_GET_LENGTH(unicode);
13768
13769 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013770#ifdef Py_DEBUG
13771 _PyUnicode_HASH(self) = -1;
13772#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013773 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013774#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013775 _PyUnicode_STATE(self).interned = 0;
13776 _PyUnicode_STATE(self).kind = kind;
13777 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013778 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013779 _PyUnicode_STATE(self).ready = 1;
13780 _PyUnicode_WSTR(self) = NULL;
13781 _PyUnicode_UTF8_LENGTH(self) = 0;
13782 _PyUnicode_UTF8(self) = NULL;
13783 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013784 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013785
13786 share_utf8 = 0;
13787 share_wstr = 0;
13788 if (kind == PyUnicode_1BYTE_KIND) {
13789 char_size = 1;
13790 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13791 share_utf8 = 1;
13792 }
13793 else if (kind == PyUnicode_2BYTE_KIND) {
13794 char_size = 2;
13795 if (sizeof(wchar_t) == 2)
13796 share_wstr = 1;
13797 }
13798 else {
13799 assert(kind == PyUnicode_4BYTE_KIND);
13800 char_size = 4;
13801 if (sizeof(wchar_t) == 4)
13802 share_wstr = 1;
13803 }
13804
13805 /* Ensure we won't overflow the length. */
13806 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13807 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013808 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013809 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013810 data = PyObject_MALLOC((length + 1) * char_size);
13811 if (data == NULL) {
13812 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013813 goto onError;
13814 }
13815
Victor Stinnerc3c74152011-10-02 20:39:55 +020013816 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013817 if (share_utf8) {
13818 _PyUnicode_UTF8_LENGTH(self) = length;
13819 _PyUnicode_UTF8(self) = data;
13820 }
13821 if (share_wstr) {
13822 _PyUnicode_WSTR_LENGTH(self) = length;
13823 _PyUnicode_WSTR(self) = (wchar_t *)data;
13824 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013825
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013826 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013827 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013828 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013829#ifdef Py_DEBUG
13830 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13831#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020013832 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013833 return (PyObject *)self;
13834
13835onError:
13836 Py_DECREF(unicode);
13837 Py_DECREF(self);
13838 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013839}
13840
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013841PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000013842 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000013843\n\
Collin Winterd474ce82007-08-07 19:42:11 +000013844Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000013845encoding defaults to the current default string encoding.\n\
13846errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013847
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013848static PyObject *unicode_iter(PyObject *seq);
13849
Guido van Rossumd57fd912000-03-10 22:53:23 +000013850PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000013851 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013852 "str", /* tp_name */
13853 sizeof(PyUnicodeObject), /* tp_size */
13854 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013855 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013856 (destructor)unicode_dealloc, /* tp_dealloc */
13857 0, /* tp_print */
13858 0, /* tp_getattr */
13859 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013860 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013861 unicode_repr, /* tp_repr */
13862 &unicode_as_number, /* tp_as_number */
13863 &unicode_as_sequence, /* tp_as_sequence */
13864 &unicode_as_mapping, /* tp_as_mapping */
13865 (hashfunc) unicode_hash, /* tp_hash*/
13866 0, /* tp_call*/
13867 (reprfunc) unicode_str, /* tp_str */
13868 PyObject_GenericGetAttr, /* tp_getattro */
13869 0, /* tp_setattro */
13870 0, /* tp_as_buffer */
13871 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000013872 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013873 unicode_doc, /* tp_doc */
13874 0, /* tp_traverse */
13875 0, /* tp_clear */
13876 PyUnicode_RichCompare, /* tp_richcompare */
13877 0, /* tp_weaklistoffset */
13878 unicode_iter, /* tp_iter */
13879 0, /* tp_iternext */
13880 unicode_methods, /* tp_methods */
13881 0, /* tp_members */
13882 0, /* tp_getset */
13883 &PyBaseObject_Type, /* tp_base */
13884 0, /* tp_dict */
13885 0, /* tp_descr_get */
13886 0, /* tp_descr_set */
13887 0, /* tp_dictoffset */
13888 0, /* tp_init */
13889 0, /* tp_alloc */
13890 unicode_new, /* tp_new */
13891 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013892};
13893
13894/* Initialize the Unicode implementation */
13895
Victor Stinner3a50e702011-10-18 21:21:00 +020013896int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013897{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013898 int i;
13899
Thomas Wouters477c8d52006-05-27 19:21:47 +000013900 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013901 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013902 0x000A, /* LINE FEED */
13903 0x000D, /* CARRIAGE RETURN */
13904 0x001C, /* FILE SEPARATOR */
13905 0x001D, /* GROUP SEPARATOR */
13906 0x001E, /* RECORD SEPARATOR */
13907 0x0085, /* NEXT LINE */
13908 0x2028, /* LINE SEPARATOR */
13909 0x2029, /* PARAGRAPH SEPARATOR */
13910 };
13911
Fred Drakee4315f52000-05-09 19:53:39 +000013912 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013913 unicode_empty = PyUnicode_New(0, 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013914 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013915 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013916 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013917
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013918 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013919 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013920 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013921 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013922
13923 /* initialize the linebreak bloom filter */
13924 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013925 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013926 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013927
13928 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020013929
13930#ifdef HAVE_MBCS
13931 winver.dwOSVersionInfoSize = sizeof(winver);
13932 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
13933 PyErr_SetFromWindowsErr(0);
13934 return -1;
13935 }
13936#endif
13937 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013938}
13939
13940/* Finalize the Unicode implementation */
13941
/* Clear the free list.  This implementation does no work and always
   reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
13947
Guido van Rossumd57fd912000-03-10 22:53:23 +000013948void
Thomas Wouters78890102000-07-22 19:25:51 +000013949_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013950{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013951 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013952
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013953 Py_XDECREF(unicode_empty);
13954 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013955
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013956 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013957 if (unicode_latin1[i]) {
13958 Py_DECREF(unicode_latin1[i]);
13959 unicode_latin1[i] = NULL;
13960 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013961 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020013962 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000013963 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013964}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013965
Walter Dörwald16807132007-05-25 13:52:07 +000013966void
13967PyUnicode_InternInPlace(PyObject **p)
13968{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013969 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
13970 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020013971#ifdef Py_DEBUG
13972 assert(s != NULL);
13973 assert(_PyUnicode_CHECK(s));
13974#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000013975 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020013976 return;
13977#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000013978 /* If it's a subclass, we don't really know what putting
13979 it in the interned dict might do. */
13980 if (!PyUnicode_CheckExact(s))
13981 return;
13982 if (PyUnicode_CHECK_INTERNED(s))
13983 return;
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013984 if (_PyUnicode_READY_REPLACE(p)) {
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013985 assert(0 && "_PyUnicode_READY_REPLACE fail in PyUnicode_InternInPlace");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013986 return;
13987 }
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013988 s = (PyUnicodeObject *)(*p);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013989 if (interned == NULL) {
13990 interned = PyDict_New();
13991 if (interned == NULL) {
13992 PyErr_Clear(); /* Don't leave an exception */
13993 return;
13994 }
13995 }
13996 /* It might be that the GetItem call fails even
13997 though the key is present in the dictionary,
13998 namely when this happens during a stack overflow. */
13999 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000014000 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014001 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014002
Benjamin Peterson29060642009-01-31 22:14:21 +000014003 if (t) {
14004 Py_INCREF(t);
14005 Py_DECREF(*p);
14006 *p = t;
14007 return;
14008 }
Walter Dörwald16807132007-05-25 13:52:07 +000014009
Benjamin Peterson14339b62009-01-31 16:36:08 +000014010 PyThreadState_GET()->recursion_critical = 1;
14011 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
14012 PyErr_Clear();
14013 PyThreadState_GET()->recursion_critical = 0;
14014 return;
14015 }
14016 PyThreadState_GET()->recursion_critical = 0;
14017 /* The two references in interned are not counted by refcnt.
14018 The deallocator will take care of this */
14019 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014020 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014021}
14022
14023void
14024PyUnicode_InternImmortal(PyObject **p)
14025{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014026 PyUnicodeObject *u = (PyUnicodeObject *)*p;
14027
Benjamin Peterson14339b62009-01-31 16:36:08 +000014028 PyUnicode_InternInPlace(p);
14029 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014030 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014031 Py_INCREF(*p);
14032 }
Walter Dörwald16807132007-05-25 13:52:07 +000014033}
14034
14035PyObject *
14036PyUnicode_InternFromString(const char *cp)
14037{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014038 PyObject *s = PyUnicode_FromString(cp);
14039 if (s == NULL)
14040 return NULL;
14041 PyUnicode_InternInPlace(&s);
14042 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014043}
14044
Alexander Belopolsky40018472011-02-26 01:02:56 +000014045void
14046_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014047{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014048 PyObject *keys;
14049 PyUnicodeObject *s;
14050 Py_ssize_t i, n;
14051 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014052
Benjamin Peterson14339b62009-01-31 16:36:08 +000014053 if (interned == NULL || !PyDict_Check(interned))
14054 return;
14055 keys = PyDict_Keys(interned);
14056 if (keys == NULL || !PyList_Check(keys)) {
14057 PyErr_Clear();
14058 return;
14059 }
Walter Dörwald16807132007-05-25 13:52:07 +000014060
Benjamin Peterson14339b62009-01-31 16:36:08 +000014061 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14062 detector, interned unicode strings are not forcibly deallocated;
14063 rather, we give them their stolen references back, and then clear
14064 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014065
Benjamin Peterson14339b62009-01-31 16:36:08 +000014066 n = PyList_GET_SIZE(keys);
14067 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014068 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014069 for (i = 0; i < n; i++) {
14070 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014071 if (PyUnicode_READY(s) == -1) {
14072 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014073 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014074 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014075 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014076 case SSTATE_NOT_INTERNED:
14077 /* XXX Shouldn't happen */
14078 break;
14079 case SSTATE_INTERNED_IMMORTAL:
14080 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014081 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014082 break;
14083 case SSTATE_INTERNED_MORTAL:
14084 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014085 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014086 break;
14087 default:
14088 Py_FatalError("Inconsistent interned string state.");
14089 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014090 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014091 }
14092 fprintf(stderr, "total size of all interned strings: "
14093 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14094 "mortal/immortal\n", mortal_size, immortal_size);
14095 Py_DECREF(keys);
14096 PyDict_Clear(interned);
14097 Py_DECREF(interned);
14098 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000014099}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014100
14101
14102/********************* Unicode Iterator **************************/
14103
14104typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014105 PyObject_HEAD
14106 Py_ssize_t it_index;
14107 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014108} unicodeiterobject;
14109
14110static void
14111unicodeiter_dealloc(unicodeiterobject *it)
14112{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014113 _PyObject_GC_UNTRACK(it);
14114 Py_XDECREF(it->it_seq);
14115 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014116}
14117
14118static int
14119unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14120{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014121 Py_VISIT(it->it_seq);
14122 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014123}
14124
14125static PyObject *
14126unicodeiter_next(unicodeiterobject *it)
14127{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014128 PyUnicodeObject *seq;
14129 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014130
Benjamin Peterson14339b62009-01-31 16:36:08 +000014131 assert(it != NULL);
14132 seq = it->it_seq;
14133 if (seq == NULL)
14134 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014135 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014136
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014137 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14138 int kind = PyUnicode_KIND(seq);
14139 void *data = PyUnicode_DATA(seq);
14140 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14141 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014142 if (item != NULL)
14143 ++it->it_index;
14144 return item;
14145 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014146
Benjamin Peterson14339b62009-01-31 16:36:08 +000014147 Py_DECREF(seq);
14148 it->it_seq = NULL;
14149 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014150}
14151
14152static PyObject *
14153unicodeiter_len(unicodeiterobject *it)
14154{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014155 Py_ssize_t len = 0;
14156 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014157 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014158 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014159}
14160
14161PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14162
14163static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014164 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014165 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014166 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014167};
14168
14169PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014170 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14171 "str_iterator", /* tp_name */
14172 sizeof(unicodeiterobject), /* tp_basicsize */
14173 0, /* tp_itemsize */
14174 /* methods */
14175 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14176 0, /* tp_print */
14177 0, /* tp_getattr */
14178 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014179 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014180 0, /* tp_repr */
14181 0, /* tp_as_number */
14182 0, /* tp_as_sequence */
14183 0, /* tp_as_mapping */
14184 0, /* tp_hash */
14185 0, /* tp_call */
14186 0, /* tp_str */
14187 PyObject_GenericGetAttr, /* tp_getattro */
14188 0, /* tp_setattro */
14189 0, /* tp_as_buffer */
14190 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14191 0, /* tp_doc */
14192 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14193 0, /* tp_clear */
14194 0, /* tp_richcompare */
14195 0, /* tp_weaklistoffset */
14196 PyObject_SelfIter, /* tp_iter */
14197 (iternextfunc)unicodeiter_next, /* tp_iternext */
14198 unicodeiter_methods, /* tp_methods */
14199 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014200};
14201
14202static PyObject *
14203unicode_iter(PyObject *seq)
14204{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014205 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014206
Benjamin Peterson14339b62009-01-31 16:36:08 +000014207 if (!PyUnicode_Check(seq)) {
14208 PyErr_BadInternalCall();
14209 return NULL;
14210 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014211 if (PyUnicode_READY(seq) == -1)
14212 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014213 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14214 if (it == NULL)
14215 return NULL;
14216 it->it_index = 0;
14217 Py_INCREF(seq);
14218 it->it_seq = (PyUnicodeObject *)seq;
14219 _PyObject_GC_TRACK(it);
14220 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014221}
14222
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014223#define UNIOP(x) Py_UNICODE_##x
14224#define UNIOP_t Py_UNICODE
14225#include "uniops.h"
14226#undef UNIOP
14227#undef UNIOP_t
14228#define UNIOP(x) Py_UCS4_##x
14229#define UNIOP_t Py_UCS4
14230#include "uniops.h"
14231#undef UNIOP
14232#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000014233
Victor Stinner71133ff2010-09-01 23:43:53 +000014234Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000014235PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000014236{
14237 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
Victor Stinner577db2c2011-10-11 22:12:48 +020014238 Py_UNICODE *u, *copy;
Victor Stinner71133ff2010-09-01 23:43:53 +000014239 Py_ssize_t size;
14240
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014241 if (!PyUnicode_Check(unicode)) {
14242 PyErr_BadArgument();
14243 return NULL;
14244 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014245 u = PyUnicode_AsUnicode(object);
14246 if (u == NULL)
14247 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014248 /* Ensure we won't overflow the size. */
14249 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
14250 PyErr_NoMemory();
14251 return NULL;
14252 }
14253 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
14254 size *= sizeof(Py_UNICODE);
14255 copy = PyMem_Malloc(size);
14256 if (copy == NULL) {
14257 PyErr_NoMemory();
14258 return NULL;
14259 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014260 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014261 return copy;
14262}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014263
/* The _string module: exports formatter_parser and formatter_field_name_split
   for use by the string.Formatter class, which is implemented in Python. */
14266
14267static PyMethodDef _string_methods[] = {
14268 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14269 METH_O, PyDoc_STR("split the argument as a field name")},
14270 {"formatter_parser", (PyCFunction) formatter_parser,
14271 METH_O, PyDoc_STR("parse the argument as a format string")},
14272 {NULL, NULL}
14273};
14274
14275static struct PyModuleDef _string_module = {
14276 PyModuleDef_HEAD_INIT,
14277 "_string",
14278 PyDoc_STR("string helper module"),
14279 0,
14280 _string_methods,
14281 NULL,
14282 NULL,
14283 NULL,
14284 NULL
14285};
14286
14287PyMODINIT_FUNC
14288PyInit__string(void)
14289{
14290 return PyModule_Create(&_string_module);
14291}
14292
14293
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014294#ifdef __cplusplus
14295}
14296#endif