blob: 5766237ed15735442ec19f25cdbad71ca1348edf [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
Victor Stinnerce5faf62011-10-05 00:42:43 +020049#ifdef Py_DEBUG
50# define DONT_MAKE_RESULT_READY
51#endif
52
Guido van Rossumd57fd912000-03-10 22:53:23 +000053/* Limit for the Unicode object free list */
54
Christian Heimes2202f872008-02-06 14:31:34 +000055#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000056
57/* Limit for the Unicode object free list stay alive optimization.
58
59 The implementation will keep allocated Unicode memory intact for
60 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000061 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000062
Christian Heimes2202f872008-02-06 14:31:34 +000063 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000065 malloc()-overhead) bytes of unused garbage.
66
67 Setting the limit to 0 effectively turns the feature off.
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069 Note: This is an experimental feature ! If you get core dumps when
70 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000071
72*/
73
Guido van Rossumfd4b9572000-04-10 13:51:10 +000074#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000075
76/* Endianness switches; defaults to little endian */
77
78#ifdef WORDS_BIGENDIAN
79# define BYTEORDER_IS_BIG_ENDIAN
80#else
81# define BYTEORDER_IS_LITTLE_ENDIAN
82#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Victor Stinner910337b2011-10-03 03:20:16 +020096#ifdef Py_DEBUG
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020097# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
Victor Stinner910337b2011-10-03 03:20:16 +020098#else
99# define _PyUnicode_CHECK(op) PyUnicode_Check(op)
100#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200101
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200102#define _PyUnicode_UTF8(op) \
103 (((PyCompactUnicodeObject*)(op))->utf8)
104#define PyUnicode_UTF8(op) \
Victor Stinner910337b2011-10-03 03:20:16 +0200105 (assert(_PyUnicode_CHECK(op)), \
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200106 assert(PyUnicode_IS_READY(op)), \
107 PyUnicode_IS_COMPACT_ASCII(op) ? \
108 ((char*)((PyASCIIObject*)(op) + 1)) : \
109 _PyUnicode_UTF8(op))
Victor Stinnerbc8b81b2011-09-29 19:31:34 +0200110#define _PyUnicode_UTF8_LENGTH(op) \
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200111 (((PyCompactUnicodeObject*)(op))->utf8_length)
112#define PyUnicode_UTF8_LENGTH(op) \
Victor Stinner910337b2011-10-03 03:20:16 +0200113 (assert(_PyUnicode_CHECK(op)), \
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200114 assert(PyUnicode_IS_READY(op)), \
115 PyUnicode_IS_COMPACT_ASCII(op) ? \
116 ((PyASCIIObject*)(op))->length : \
117 _PyUnicode_UTF8_LENGTH(op))
Victor Stinnera5f91632011-10-04 01:07:11 +0200118#define _PyUnicode_WSTR(op) \
119 (((PyASCIIObject*)(op))->wstr)
120#define _PyUnicode_WSTR_LENGTH(op) \
121 (((PyCompactUnicodeObject*)(op))->wstr_length)
122#define _PyUnicode_LENGTH(op) \
123 (((PyASCIIObject *)(op))->length)
124#define _PyUnicode_STATE(op) \
125 (((PyASCIIObject *)(op))->state)
126#define _PyUnicode_HASH(op) \
127 (((PyASCIIObject *)(op))->hash)
Victor Stinner910337b2011-10-03 03:20:16 +0200128#define _PyUnicode_KIND(op) \
129 (assert(_PyUnicode_CHECK(op)), \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200130 ((PyASCIIObject *)(op))->state.kind)
Victor Stinner910337b2011-10-03 03:20:16 +0200131#define _PyUnicode_GET_LENGTH(op) \
132 (assert(_PyUnicode_CHECK(op)), \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200133 ((PyASCIIObject *)(op))->length)
Victor Stinnera5f91632011-10-04 01:07:11 +0200134#define _PyUnicode_DATA_ANY(op) \
135 (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200136
Victor Stinner910337b2011-10-03 03:20:16 +0200137#undef PyUnicode_READY
138#define PyUnicode_READY(op) \
139 (assert(_PyUnicode_CHECK(op)), \
140 (PyUnicode_IS_READY(op) ? \
Victor Stinnera5f91632011-10-04 01:07:11 +0200141 0 : \
142 _PyUnicode_Ready((PyObject *)(op))))
Victor Stinner910337b2011-10-03 03:20:16 +0200143
/* Make sure the string referenced through p_obj (a pointer to an object
   pointer) is in the ready state.

   Evaluates to 0 when *p_obj is already ready; otherwise evaluates to the
   result of _PyUnicode_ReadyReplace(), which receives p_obj itself.

   The argument is fully parenthesized so that expressions such as
   "array + 1" are dereferenced as a whole.  Note that p_obj is evaluated
   more than once: do not pass an expression with side effects. */
#define _PyUnicode_READY_REPLACE(p_obj)                         \
    (assert(_PyUnicode_CHECK(*(p_obj))),                        \
     (PyUnicode_IS_READY(*(p_obj)) ?                            \
      0 : _PyUnicode_ReadyReplace((PyObject **)(p_obj))))
148
Victor Stinnerc379ead2011-10-03 12:52:27 +0200149#define _PyUnicode_SHARE_UTF8(op) \
150 (assert(_PyUnicode_CHECK(op)), \
151 assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
152 (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* true if the wstr pointer of the Unicode object is the very same pointer
   as its character data (PyUnicode_DATA), i.e. no separate wstr block.
   The macro only refers to its own argument, so it can be applied to any
   expression, whatever the caller's variable is called. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
156
Victor Stinner829c0ad2011-10-03 01:08:02 +0200157/* true if the Unicode object has an allocated UTF-8 memory block
158 (not shared with other data) */
Victor Stinner910337b2011-10-03 03:20:16 +0200159#define _PyUnicode_HAS_UTF8_MEMORY(op) \
160 (assert(_PyUnicode_CHECK(op)), \
161 (!PyUnicode_IS_COMPACT_ASCII(op) \
162 && _PyUnicode_UTF8(op) \
Victor Stinner829c0ad2011-10-03 01:08:02 +0200163 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
164
Victor Stinner03490912011-10-03 23:45:12 +0200165/* true if the Unicode object has an allocated wstr memory block
166 (not shared with other data) */
167#define _PyUnicode_HAS_WSTR_MEMORY(op) \
168 (assert(_PyUnicode_CHECK(op)), \
169 (_PyUnicode_WSTR(op) && \
170 (!PyUnicode_IS_READY(op) || \
171 _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
172
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Each source character is converted with a plain cast to to_type.  The
   bulk of the input is copied four characters per iteration; a second loop
   handles the remaining 0-3 characters.

   The "to" argument is parenthesized before being cast, so that pointer
   arithmetic inside the argument is performed in the argument's own type
   and not in units of to_type. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + (_n & ~ (Py_ssize_t) 3);            \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200196
Victor Stinnerb15d4d82011-09-28 23:59:20 +0200197/* The Unicode string has been modified: reset the hash */
198#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
199
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
208static PyObject *interned;
209
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000210/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200211static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000212
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200213/* List of static strings. */
214static _Py_Identifier *static_strings;
215
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000216/* Single character Unicode strings in the Latin-1 range are being
217 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200218static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000219
Christian Heimes190d79e2008-01-30 11:58:22 +0000220/* Fast detection of the most frequent whitespace characters */
221const unsigned char _Py_ascii_whitespace[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +0000222 0, 0, 0, 0, 0, 0, 0, 0,
Florent Xicluna806d8cf2010-03-30 19:34:18 +0000223/* case 0x0009: * CHARACTER TABULATION */
Christian Heimes1a8501c2008-10-02 19:56:01 +0000224/* case 0x000A: * LINE FEED */
Florent Xicluna806d8cf2010-03-30 19:34:18 +0000225/* case 0x000B: * LINE TABULATION */
Christian Heimes1a8501c2008-10-02 19:56:01 +0000226/* case 0x000C: * FORM FEED */
227/* case 0x000D: * CARRIAGE RETURN */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000228 0, 1, 1, 1, 1, 1, 0, 0,
229 0, 0, 0, 0, 0, 0, 0, 0,
Christian Heimes1a8501c2008-10-02 19:56:01 +0000230/* case 0x001C: * FILE SEPARATOR */
231/* case 0x001D: * GROUP SEPARATOR */
232/* case 0x001E: * RECORD SEPARATOR */
233/* case 0x001F: * UNIT SEPARATOR */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000234 0, 0, 0, 0, 1, 1, 1, 1,
Christian Heimes1a8501c2008-10-02 19:56:01 +0000235/* case 0x0020: * SPACE */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000236 1, 0, 0, 0, 0, 0, 0, 0,
237 0, 0, 0, 0, 0, 0, 0, 0,
238 0, 0, 0, 0, 0, 0, 0, 0,
239 0, 0, 0, 0, 0, 0, 0, 0,
Christian Heimes190d79e2008-01-30 11:58:22 +0000240
Benjamin Peterson14339b62009-01-31 16:36:08 +0000241 0, 0, 0, 0, 0, 0, 0, 0,
242 0, 0, 0, 0, 0, 0, 0, 0,
243 0, 0, 0, 0, 0, 0, 0, 0,
244 0, 0, 0, 0, 0, 0, 0, 0,
245 0, 0, 0, 0, 0, 0, 0, 0,
246 0, 0, 0, 0, 0, 0, 0, 0,
247 0, 0, 0, 0, 0, 0, 0, 0,
248 0, 0, 0, 0, 0, 0, 0, 0
Christian Heimes190d79e2008-01-30 11:58:22 +0000249};
250
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200251/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200252static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200253static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200254static void copy_characters(
255 PyObject *to, Py_ssize_t to_start,
256 PyObject *from, Py_ssize_t from_start,
257 Py_ssize_t how_many);
Victor Stinnerc729b8e2011-10-06 02:36:59 +0200258#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200259static int unicode_is_singleton(PyObject *unicode);
Victor Stinnerc729b8e2011-10-06 02:36:59 +0200260#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200261
Alexander Belopolsky40018472011-02-26 01:02:56 +0000262static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200263unicode_fromascii(const unsigned char *s, Py_ssize_t size);
264static PyObject *
265_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
266static PyObject *
267_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
268static PyObject *
269_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
270
271static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000272unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000273 PyObject **errorHandler,const char *encoding, const char *reason,
274 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
275 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
276
Alexander Belopolsky40018472011-02-26 01:02:56 +0000277static void
278raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300279 const char *encoding,
280 const Py_UNICODE *unicode, Py_ssize_t size,
281 Py_ssize_t startpos, Py_ssize_t endpos,
282 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000283
Christian Heimes190d79e2008-01-30 11:58:22 +0000284/* Same for linebreaks */
285static unsigned char ascii_linebreak[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +0000286 0, 0, 0, 0, 0, 0, 0, 0,
Christian Heimes1a8501c2008-10-02 19:56:01 +0000287/* 0x000A, * LINE FEED */
Florent Xicluna806d8cf2010-03-30 19:34:18 +0000288/* 0x000B, * LINE TABULATION */
289/* 0x000C, * FORM FEED */
Christian Heimes1a8501c2008-10-02 19:56:01 +0000290/* 0x000D, * CARRIAGE RETURN */
Florent Xicluna806d8cf2010-03-30 19:34:18 +0000291 0, 0, 1, 1, 1, 1, 0, 0,
Benjamin Peterson14339b62009-01-31 16:36:08 +0000292 0, 0, 0, 0, 0, 0, 0, 0,
Christian Heimes1a8501c2008-10-02 19:56:01 +0000293/* 0x001C, * FILE SEPARATOR */
294/* 0x001D, * GROUP SEPARATOR */
295/* 0x001E, * RECORD SEPARATOR */
Benjamin Peterson14339b62009-01-31 16:36:08 +0000296 0, 0, 0, 0, 1, 1, 1, 0,
297 0, 0, 0, 0, 0, 0, 0, 0,
298 0, 0, 0, 0, 0, 0, 0, 0,
299 0, 0, 0, 0, 0, 0, 0, 0,
300 0, 0, 0, 0, 0, 0, 0, 0,
Christian Heimes190d79e2008-01-30 11:58:22 +0000301
Benjamin Peterson14339b62009-01-31 16:36:08 +0000302 0, 0, 0, 0, 0, 0, 0, 0,
303 0, 0, 0, 0, 0, 0, 0, 0,
304 0, 0, 0, 0, 0, 0, 0, 0,
305 0, 0, 0, 0, 0, 0, 0, 0,
306 0, 0, 0, 0, 0, 0, 0, 0,
307 0, 0, 0, 0, 0, 0, 0, 0,
308 0, 0, 0, 0, 0, 0, 0, 0,
309 0, 0, 0, 0, 0, 0, 0, 0
Christian Heimes190d79e2008-01-30 11:58:22 +0000310};
311
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300312/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
313 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000314Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000315PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000316{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000317#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000318 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000319#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000320 /* This is actually an illegal character, so it should
321 not be passed to unichr. */
322 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000323#endif
324}
325
#ifdef Py_DEBUG
/* Debug-build sanity check of the internal state of a Unicode object.

   op             the object to verify (must pass PyUnicode_Check)
   check_content  if non-zero, additionally scan the characters to verify
                  that the narrowest suitable kind is used, and verify that
                  the hash of a non-singleton string is still -1

   Every violated invariant triggers an assert(); when all checks pass the
   function returns 1, so it can itself be wrapped in an assert().

   FIXME: use PyObject* type for op */
int
_PyUnicode_CheckConsistency(void *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;
        /* the kind whose character width equals sizeof(wchar_t): only for
           that kind is wstr expected to be the same pointer as the data */
#if SIZEOF_WCHAR_T == 2
        const unsigned int wchar_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int wchar_kind = PyUnicode_4BYTE_KIND;
#endif

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: characters directly follow the header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind != PyUnicode_WCHAR_KIND) {
                /* non-compact string that has been made ready */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else {
                    assert (compact->utf8 != data);
                }
            }
            else {
                /* not ready yet: only the wstr representation exists */
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            if (kind == wchar_kind) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        /* a missing buffer must come with a zero length */
        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }

    if (check_content) {
        /* check that the best kind is used */
        if (kind != PyUnicode_WCHAR_KIND) {
            Py_ssize_t i = 0;
            Py_UCS4 maxchar = 0;
            void *data = PyUnicode_DATA(ascii);

            while (i < ascii->length) {
                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
                if (ch > maxchar)
                    maxchar = ch;
                i++;
            }

            switch (kind) {
            case PyUnicode_1BYTE_KIND:
                if (ascii->state.ascii == 0) {
                    assert(maxchar >= 128);
                }
                else {
                    assert(maxchar < 128);
                }
                break;
            case PyUnicode_2BYTE_KIND:
                assert(maxchar >= 0x100);
                break;
            default:
                assert(maxchar >= 0x10000);
                break;
            }
        }

        /* apart from the singletons, the hash must not be computed yet */
        if (!unicode_is_singleton((PyObject*)ascii)) {
            assert(ascii->hash == -1);
        }
    }
    return 1;
}
#endif
431
Thomas Wouters477c8d52006-05-27 19:21:47 +0000432/* --- Bloom Filters ----------------------------------------------------- */
433
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits of each Unicode character as the bit index. */
437
438/* the linebreak mask is set up by Unicode_Init below */
439
Antoine Pitrouf068f942010-01-13 14:19:12 +0000440#if LONG_BIT >= 128
441#define BLOOM_WIDTH 128
442#elif LONG_BIT >= 64
443#define BLOOM_WIDTH 64
444#elif LONG_BIT >= 32
445#define BLOOM_WIDTH 32
446#else
447#error "LONG_BIT is smaller than 32"
448#endif
449
Thomas Wouters477c8d52006-05-27 19:21:47 +0000450#define BLOOM_MASK unsigned long
451
452static BLOOM_MASK bloom_linebreak;
453
Antoine Pitrouf068f942010-01-13 14:19:12 +0000454#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
455#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000456
Benjamin Peterson29060642009-01-31 22:14:21 +0000457#define BLOOM_LINEBREAK(ch) \
458 ((ch) < 128U ? ascii_linebreak[(ch)] : \
459 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000460
Alexander Belopolsky40018472011-02-26 01:02:56 +0000461Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200462make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000463{
464 /* calculate simple bloom-style bitmask for a given unicode string */
465
Antoine Pitrouf068f942010-01-13 14:19:12 +0000466 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000467 Py_ssize_t i;
468
469 mask = 0;
470 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200471 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000472
473 return mask;
474}
475
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200476#define BLOOM_MEMBER(mask, chr, str) \
477 (BLOOM(mask, chr) \
478 && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000479
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200480/* Compilation of templated routines */
481
482#include "stringlib/asciilib.h"
483#include "stringlib/fastsearch.h"
484#include "stringlib/partition.h"
485#include "stringlib/split.h"
486#include "stringlib/count.h"
487#include "stringlib/find.h"
488#include "stringlib/find_max_char.h"
489#include "stringlib/localeutil.h"
490#include "stringlib/undef.h"
491
492#include "stringlib/ucs1lib.h"
493#include "stringlib/fastsearch.h"
494#include "stringlib/partition.h"
495#include "stringlib/split.h"
496#include "stringlib/count.h"
497#include "stringlib/find.h"
498#include "stringlib/find_max_char.h"
499#include "stringlib/localeutil.h"
500#include "stringlib/undef.h"
501
502#include "stringlib/ucs2lib.h"
503#include "stringlib/fastsearch.h"
504#include "stringlib/partition.h"
505#include "stringlib/split.h"
506#include "stringlib/count.h"
507#include "stringlib/find.h"
508#include "stringlib/find_max_char.h"
509#include "stringlib/localeutil.h"
510#include "stringlib/undef.h"
511
512#include "stringlib/ucs4lib.h"
513#include "stringlib/fastsearch.h"
514#include "stringlib/partition.h"
515#include "stringlib/split.h"
516#include "stringlib/count.h"
517#include "stringlib/find.h"
518#include "stringlib/find_max_char.h"
519#include "stringlib/localeutil.h"
520#include "stringlib/undef.h"
521
Guido van Rossumd57fd912000-03-10 22:53:23 +0000522/* --- Unicode Object ----------------------------------------------------- */
523
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200524static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200525fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200526
527Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
528 Py_ssize_t size, Py_UCS4 ch,
529 int direction)
530{
531 /* like wcschr, but doesn't stop at NULL characters */
532 Py_ssize_t i;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200533 if (kind == 1) {
534 if (direction == 1)
535 return memchr(s, ch, size);
536#ifdef HAVE_MEMRCHR
537 else
538 return memrchr(s, ch, size);
539#endif
540 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200541 if (direction == 1) {
542 for(i = 0; i < size; i++)
543 if (PyUnicode_READ(kind, s, i) == ch)
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200544 return (char*)s + kind * i;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200545 }
546 else {
547 for(i = size-1; i >= 0; i--)
548 if (PyUnicode_READ(kind, s, i) == ch)
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200549 return (char*)s + kind * i;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200550 }
551 return NULL;
552}
553
Victor Stinnerfe226c02011-10-03 03:52:20 +0200554static PyObject*
555resize_compact(PyObject *unicode, Py_ssize_t length)
556{
557 Py_ssize_t char_size;
558 Py_ssize_t struct_size;
559 Py_ssize_t new_size;
560 int share_wstr;
561
562 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200563 char_size = PyUnicode_KIND(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200564 if (PyUnicode_IS_COMPACT_ASCII(unicode))
565 struct_size = sizeof(PyASCIIObject);
566 else
567 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200568 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200569
570 _Py_DEC_REFTOTAL;
571 _Py_ForgetReference(unicode);
572
573 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
574 PyErr_NoMemory();
575 return NULL;
576 }
577 new_size = (struct_size + (length + 1) * char_size);
578
579 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
580 if (unicode == NULL) {
581 PyObject_Del(unicode);
582 PyErr_NoMemory();
583 return NULL;
584 }
585 _Py_NewReference(unicode);
586 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200587 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200588 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200589 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
590 _PyUnicode_WSTR_LENGTH(unicode) = length;
591 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200592 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
593 length, 0);
594 return unicode;
595}
596
Alexander Belopolsky40018472011-02-26 01:02:56 +0000597static int
Victor Stinner95663112011-10-04 01:03:50 +0200598resize_inplace(PyUnicodeObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000599{
Victor Stinner95663112011-10-04 01:03:50 +0200600 wchar_t *wstr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200601 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200602 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000603
Victor Stinner95663112011-10-04 01:03:50 +0200604 _PyUnicode_DIRTY(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200605
606 if (PyUnicode_IS_READY(unicode)) {
607 Py_ssize_t char_size;
608 Py_ssize_t new_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200609 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200610 void *data;
611
612 data = _PyUnicode_DATA_ANY(unicode);
613 assert(data != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200614 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200615 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
616 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinner95663112011-10-04 01:03:50 +0200617 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
618 {
619 PyObject_DEL(_PyUnicode_UTF8(unicode));
620 _PyUnicode_UTF8(unicode) = NULL;
621 _PyUnicode_UTF8_LENGTH(unicode) = 0;
622 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200623
624 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
625 PyErr_NoMemory();
626 return -1;
627 }
628 new_size = (length + 1) * char_size;
629
630 data = (PyObject *)PyObject_REALLOC(data, new_size);
631 if (data == NULL) {
632 PyErr_NoMemory();
633 return -1;
634 }
635 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200636 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200637 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200638 _PyUnicode_WSTR_LENGTH(unicode) = length;
639 }
640 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200641 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200642 _PyUnicode_UTF8_LENGTH(unicode) = length;
643 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200644 _PyUnicode_LENGTH(unicode) = length;
645 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200646 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200647 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200648 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200649 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200650 }
Victor Stinner95663112011-10-04 01:03:50 +0200651 assert(_PyUnicode_WSTR(unicode) != NULL);
652
653 /* check for integer overflow */
654 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
655 PyErr_NoMemory();
656 return -1;
657 }
658 wstr = _PyUnicode_WSTR(unicode);
659 wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
660 if (!wstr) {
661 PyErr_NoMemory();
662 return -1;
663 }
664 _PyUnicode_WSTR(unicode) = wstr;
665 _PyUnicode_WSTR(unicode)[length] = 0;
666 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200667 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000668 return 0;
669}
670
Victor Stinnerfe226c02011-10-03 03:52:20 +0200671static PyObject*
672resize_copy(PyObject *unicode, Py_ssize_t length)
673{
674 Py_ssize_t copy_length;
675 if (PyUnicode_IS_COMPACT(unicode)) {
676 PyObject *copy;
677 assert(PyUnicode_IS_READY(unicode));
678
679 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
680 if (copy == NULL)
681 return NULL;
682
683 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200684 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200685 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200686 }
687 else {
Victor Stinner2fd82272011-10-03 04:06:05 +0200688 PyUnicodeObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200689 assert(_PyUnicode_WSTR(unicode) != NULL);
690 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner2fd82272011-10-03 04:06:05 +0200691 w = _PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200692 if (w == NULL)
693 return NULL;
694 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
695 copy_length = Py_MIN(copy_length, length);
696 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
697 copy_length);
698 return (PyObject*)w;
699 }
700}
701
Guido van Rossumd57fd912000-03-10 22:53:23 +0000702/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000703 Ux0000 terminated; some code (e.g. new_identifier)
704 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000705
706 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000707 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000708
709*/
710
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200711#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200712static int unicode_old_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200713#endif
714
Alexander Belopolsky40018472011-02-26 01:02:56 +0000715static PyUnicodeObject *
716_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000717{
718 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200719 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000720
Thomas Wouters477c8d52006-05-27 19:21:47 +0000721 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000722 if (length == 0 && unicode_empty != NULL) {
723 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200724 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000725 }
726
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000727 /* Ensure we won't overflow the size. */
728 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
729 return (PyUnicodeObject *)PyErr_NoMemory();
730 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200731 if (length < 0) {
732 PyErr_SetString(PyExc_SystemError,
733 "Negative size passed to _PyUnicode_New");
734 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000735 }
736
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200737#ifdef Py_DEBUG
738 ++unicode_old_new_calls;
739#endif
740
741 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
742 if (unicode == NULL)
743 return NULL;
744 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
745 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
746 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000747 PyErr_NoMemory();
748 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000749 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200750
Jeremy Hyltond8082792003-09-16 19:41:39 +0000751 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000752 * the caller fails before initializing str -- unicode_resize()
753 * reads str[0], and the Keep-Alive optimization can keep memory
754 * allocated for str alive across a call to unicode_dealloc(unicode).
755 * We don't want unicode_resize to read uninitialized memory in
756 * that case.
757 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200758 _PyUnicode_WSTR(unicode)[0] = 0;
759 _PyUnicode_WSTR(unicode)[length] = 0;
760 _PyUnicode_WSTR_LENGTH(unicode) = length;
761 _PyUnicode_HASH(unicode) = -1;
762 _PyUnicode_STATE(unicode).interned = 0;
763 _PyUnicode_STATE(unicode).kind = 0;
764 _PyUnicode_STATE(unicode).compact = 0;
765 _PyUnicode_STATE(unicode).ready = 0;
766 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200767 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200768 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200769 _PyUnicode_UTF8(unicode) = NULL;
770 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000771 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000772
Benjamin Peterson29060642009-01-31 22:14:21 +0000773 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000774 /* XXX UNREF/NEWREF interface should be more symmetrical */
775 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000776 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000777 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000778 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000779}
780
Victor Stinnerf42dc442011-10-02 23:33:16 +0200781static const char*
782unicode_kind_name(PyObject *unicode)
783{
Victor Stinner42dfd712011-10-03 14:41:45 +0200784 /* don't check consistency: unicode_kind_name() is called from
785 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200786 if (!PyUnicode_IS_COMPACT(unicode))
787 {
788 if (!PyUnicode_IS_READY(unicode))
789 return "wstr";
790 switch(PyUnicode_KIND(unicode))
791 {
792 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200793 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200794 return "legacy ascii";
795 else
796 return "legacy latin1";
797 case PyUnicode_2BYTE_KIND:
798 return "legacy UCS2";
799 case PyUnicode_4BYTE_KIND:
800 return "legacy UCS4";
801 default:
802 return "<legacy invalid kind>";
803 }
804 }
805 assert(PyUnicode_IS_READY(unicode));
806 switch(PyUnicode_KIND(unicode))
807 {
808 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200809 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200810 return "ascii";
811 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200812 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200813 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200814 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200815 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200816 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200817 default:
818 return "<invalid compact kind>";
819 }
820}
821
#ifdef Py_DEBUG
static int unicode_new_new_calls = 0;

/* Functions wrapping macros for use in debugger */

/* Wrapper around the PyUnicode_UTF8() macro. */
char *_PyUnicode_utf8(void *unicode){
    return PyUnicode_UTF8(unicode);
}

/* Wrapper around the _PyUnicode_COMPACT_DATA() macro. */
void *_PyUnicode_compact_data(void *unicode) {
    return _PyUnicode_COMPACT_DATA(unicode);
}

/* Print the addresses relevant to locating the character data of
   `unicode` on stdout, then return PyUnicode_DATA(unicode). */
void *_PyUnicode_data(void *unicode){
    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
    /* %p requires a void* argument, so convert explicitly. */
    printf("compact data %p\n", (void*)_PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}

/* Print a one-line description of the string object `op` on stdout: its
   kind name, length, wstr pointer, and (unless it is a compact ASCII
   string) the wstr length and the UTF-8 pointer and length, followed by
   the data pointer.  "shared" is printed before a pointer that is equal
   to the data pointer.

   The length fields are printed with %zd: throughout this file they are
   assigned from Py_ssize_t values, i.e. they are signed.  Pointers are
   converted to void* as %p requires. */
void
_PyUnicode_Dump(PyObject *op)
{
    PyASCIIObject *ascii = (PyASCIIObject *)op;
    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
    PyUnicodeObject *unicode = (PyUnicodeObject *)op;
    void *data;

    printf("%s: len=%zd, ", unicode_kind_name(op), ascii->length);

    /* Compact strings store their characters right after the struct. */
    if (ascii->state.compact)
        data = (compact + 1);
    else
        data = unicode->data.any;

    if (ascii->wstr == data)
        printf("shared ");
    printf("wstr=%p", (void *)ascii->wstr);

    /* The wstr_length/utf8 fields are only read for strings that are not
       compact ASCII. */
    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
        printf(" (%zd), ", compact->wstr_length);
        if (!ascii->state.compact && compact->utf8 == unicode->data.any)
            printf("shared ");
        printf("utf8=%p (%zd)", (void *)compact->utf8, compact->utf8_length);
    }
    printf(", data=%p\n", data);
}
#endif
867
868PyObject *
869PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
870{
871 PyObject *obj;
872 PyCompactUnicodeObject *unicode;
873 void *data;
874 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200875 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200876 Py_ssize_t char_size;
877 Py_ssize_t struct_size;
878
879 /* Optimization for empty strings */
880 if (size == 0 && unicode_empty != NULL) {
881 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200882 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200883 }
884
885#ifdef Py_DEBUG
886 ++unicode_new_new_calls;
887#endif
888
Victor Stinner9e9d6892011-10-04 01:02:02 +0200889 is_ascii = 0;
890 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200891 struct_size = sizeof(PyCompactUnicodeObject);
892 if (maxchar < 128) {
893 kind_state = PyUnicode_1BYTE_KIND;
894 char_size = 1;
895 is_ascii = 1;
896 struct_size = sizeof(PyASCIIObject);
897 }
898 else if (maxchar < 256) {
899 kind_state = PyUnicode_1BYTE_KIND;
900 char_size = 1;
901 }
902 else if (maxchar < 65536) {
903 kind_state = PyUnicode_2BYTE_KIND;
904 char_size = 2;
905 if (sizeof(wchar_t) == 2)
906 is_sharing = 1;
907 }
908 else {
909 kind_state = PyUnicode_4BYTE_KIND;
910 char_size = 4;
911 if (sizeof(wchar_t) == 4)
912 is_sharing = 1;
913 }
914
915 /* Ensure we won't overflow the size. */
916 if (size < 0) {
917 PyErr_SetString(PyExc_SystemError,
918 "Negative size passed to PyUnicode_New");
919 return NULL;
920 }
921 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
922 return PyErr_NoMemory();
923
924 /* Duplicated allocation code from _PyObject_New() instead of a call to
925 * PyObject_New() so we are able to allocate space for the object and
926 * it's data buffer.
927 */
928 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
929 if (obj == NULL)
930 return PyErr_NoMemory();
931 obj = PyObject_INIT(obj, &PyUnicode_Type);
932 if (obj == NULL)
933 return NULL;
934
935 unicode = (PyCompactUnicodeObject *)obj;
936 if (is_ascii)
937 data = ((PyASCIIObject*)obj) + 1;
938 else
939 data = unicode + 1;
940 _PyUnicode_LENGTH(unicode) = size;
941 _PyUnicode_HASH(unicode) = -1;
942 _PyUnicode_STATE(unicode).interned = 0;
943 _PyUnicode_STATE(unicode).kind = kind_state;
944 _PyUnicode_STATE(unicode).compact = 1;
945 _PyUnicode_STATE(unicode).ready = 1;
946 _PyUnicode_STATE(unicode).ascii = is_ascii;
947 if (is_ascii) {
948 ((char*)data)[size] = 0;
949 _PyUnicode_WSTR(unicode) = NULL;
950 }
951 else if (kind_state == PyUnicode_1BYTE_KIND) {
952 ((char*)data)[size] = 0;
953 _PyUnicode_WSTR(unicode) = NULL;
954 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200955 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200956 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200957 }
958 else {
959 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200960 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200961 if (kind_state == PyUnicode_2BYTE_KIND)
962 ((Py_UCS2*)data)[size] = 0;
963 else /* kind_state == PyUnicode_4BYTE_KIND */
964 ((Py_UCS4*)data)[size] = 0;
965 if (is_sharing) {
966 _PyUnicode_WSTR_LENGTH(unicode) = size;
967 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
968 }
969 else {
970 _PyUnicode_WSTR_LENGTH(unicode) = 0;
971 _PyUnicode_WSTR(unicode) = NULL;
972 }
973 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200974 assert(_PyUnicode_CheckConsistency(unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200975 return obj;
976}
977
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t range [begin, end) to UCS4 and store it in
   the 4-byte data buffer of `unicode`.  A high surrogate immediately
   followed by a low surrogate is combined into one code point; any other
   unit (including a lone surrogate) is copied unchanged.  The other
   conversions are implemented as macros for efficiency.

   The string must already be of 4-byte kind, and its length must equal the
   number of code points produced (both asserted).  No terminating null
   character is written here. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (src[0] >= 0xD800 && src[0] <= 0xDBFF
            && (src + 1) < end && src[1] >= 0xDC00 && src[1] <= 0xDFFF)
        {
            /* surrogate pair: 10 bits from each half, offset by 0x10000 */
            *dst++ = (((src[0] & 0x3FF)<<10) | (src[1] & 0x3FF)) + 0x10000;
            src += 2;
        }
        else {
            *dst++ = *src++;
        }
    }
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1016
Victor Stinnercd9950f2011-10-02 00:34:53 +02001017static int
1018_PyUnicode_Dirty(PyObject *unicode)
1019{
Victor Stinner910337b2011-10-03 03:20:16 +02001020 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +02001021 if (Py_REFCNT(unicode) != 1) {
Victor Stinner01698042011-10-04 00:04:26 +02001022 PyErr_SetString(PyExc_SystemError,
Victor Stinnercd9950f2011-10-02 00:34:53 +02001023 "Cannot modify a string having more than 1 reference");
1024 return -1;
1025 }
1026 _PyUnicode_DIRTY(unicode);
1027 return 0;
1028}
1029
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001030static int
1031_copy_characters(PyObject *to, Py_ssize_t to_start,
1032 PyObject *from, Py_ssize_t from_start,
1033 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001034{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001035 unsigned int from_kind, to_kind;
1036 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001037 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001038
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001039 assert(PyUnicode_Check(from));
1040 assert(PyUnicode_Check(to));
1041 assert(PyUnicode_IS_READY(from));
1042 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001043
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001044 assert(PyUnicode_GET_LENGTH(from) >= how_many);
1045 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1046 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001047
Victor Stinnerf5ca1a22011-09-28 23:54:59 +02001048 if (how_many == 0)
1049 return 0;
1050
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001051 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001052 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001053 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001054 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001055
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001056#ifdef Py_DEBUG
1057 if (!check_maxchar
1058 && (from_kind > to_kind
1059 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001060 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001061 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1062 Py_UCS4 ch;
1063 Py_ssize_t i;
1064 for (i=0; i < how_many; i++) {
1065 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1066 assert(ch <= to_maxchar);
1067 }
1068 }
1069#endif
1070 fast = (from_kind == to_kind);
1071 if (check_maxchar
1072 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1073 {
1074 /* deny latin1 => ascii */
1075 fast = 0;
1076 }
1077
1078 if (fast) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001079 Py_MEMCPY((char*)to_data + to_kind * to_start,
1080 (char*)from_data + from_kind * from_start,
1081 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001082 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001083 else if (from_kind == PyUnicode_1BYTE_KIND
1084 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001085 {
1086 _PyUnicode_CONVERT_BYTES(
1087 Py_UCS1, Py_UCS2,
1088 PyUnicode_1BYTE_DATA(from) + from_start,
1089 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1090 PyUnicode_2BYTE_DATA(to) + to_start
1091 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001092 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001093 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001094 && to_kind == PyUnicode_4BYTE_KIND)
1095 {
1096 _PyUnicode_CONVERT_BYTES(
1097 Py_UCS1, Py_UCS4,
1098 PyUnicode_1BYTE_DATA(from) + from_start,
1099 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1100 PyUnicode_4BYTE_DATA(to) + to_start
1101 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001102 }
1103 else if (from_kind == PyUnicode_2BYTE_KIND
1104 && to_kind == PyUnicode_4BYTE_KIND)
1105 {
1106 _PyUnicode_CONVERT_BYTES(
1107 Py_UCS2, Py_UCS4,
1108 PyUnicode_2BYTE_DATA(from) + from_start,
1109 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1110 PyUnicode_4BYTE_DATA(to) + to_start
1111 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001112 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001113 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001114 /* check if max_char(from substring) <= max_char(to) */
1115 if (from_kind > to_kind
1116 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001117 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001118 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001119 /* slow path to check for character overflow */
1120 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001121 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001122 Py_ssize_t i;
1123
Victor Stinner56c161a2011-10-06 02:47:11 +02001124#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001125 for (i=0; i < how_many; i++) {
1126 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001127 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001128 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1129 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001130#else
1131 if (!check_maxchar) {
1132 for (i=0; i < how_many; i++) {
1133 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1134 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1135 }
1136 }
1137 else {
1138 for (i=0; i < how_many; i++) {
1139 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1140 if (ch > to_maxchar)
1141 return 1;
1142 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1143 }
1144 }
1145#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001146 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001147 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001148 assert(0 && "inconsistent state");
1149 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001150 }
1151 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001152 return 0;
1153}
1154
1155static void
1156copy_characters(PyObject *to, Py_ssize_t to_start,
1157 PyObject *from, Py_ssize_t from_start,
1158 Py_ssize_t how_many)
1159{
1160 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1161}
1162
1163Py_ssize_t
1164PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1165 PyObject *from, Py_ssize_t from_start,
1166 Py_ssize_t how_many)
1167{
1168 int err;
1169
1170 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1171 PyErr_BadInternalCall();
1172 return -1;
1173 }
1174
1175 if (PyUnicode_READY(from))
1176 return -1;
1177 if (PyUnicode_READY(to))
1178 return -1;
1179
1180 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1181 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1182 PyErr_Format(PyExc_SystemError,
1183 "Cannot write %zi characters at %zi "
1184 "in a string of %zi characters",
1185 how_many, to_start, PyUnicode_GET_LENGTH(to));
1186 return -1;
1187 }
1188
1189 if (how_many == 0)
1190 return 0;
1191
1192 if (_PyUnicode_Dirty(to))
1193 return -1;
1194
1195 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1196 if (err) {
1197 PyErr_Format(PyExc_SystemError,
1198 "Cannot copy %s characters "
1199 "into a string of %s characters",
1200 unicode_kind_name(from),
1201 unicode_kind_name(to));
1202 return -1;
1203 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001204 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001205}
1206
Victor Stinner17222162011-09-28 22:15:37 +02001207/* Find the maximum code point and count the number of surrogate pairs so a
1208 correct string length can be computed before converting a string to UCS4.
1209 This function counts single surrogates as a character and not as a pair.
1210
1211 Return 0 on success, or -1 on error. */
1212static int
1213find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1214 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001215{
1216 const wchar_t *iter;
1217
Victor Stinnerc53be962011-10-02 21:33:54 +02001218 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001219 *num_surrogates = 0;
1220 *maxchar = 0;
1221
1222 for (iter = begin; iter < end; ) {
Victor Stinnerae864852011-10-05 14:02:44 +02001223 if (*iter > *maxchar) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001224 *maxchar = *iter;
Victor Stinnerae864852011-10-05 14:02:44 +02001225#if SIZEOF_WCHAR_T != 2
1226 if (*maxchar >= 0x10000)
1227 return 0;
1228#endif
1229 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001230#if SIZEOF_WCHAR_T == 2
1231 if (*iter >= 0xD800 && *iter <= 0xDBFF
1232 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1233 {
1234 Py_UCS4 surrogate_val;
1235 surrogate_val = (((iter[0] & 0x3FF)<<10)
1236 | (iter[1] & 0x3FF)) + 0x10000;
1237 ++(*num_surrogates);
1238 if (surrogate_val > *maxchar)
1239 *maxchar = surrogate_val;
1240 iter += 2;
1241 }
1242 else
1243 iter++;
1244#else
1245 iter++;
1246#endif
1247 }
1248 return 0;
1249}
1250
1251#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001252static int unicode_ready_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001253#endif
1254
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001255static int
1256unicode_ready(PyObject **p_obj, int replace)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001257{
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001258 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001259 wchar_t *end;
1260 Py_UCS4 maxchar = 0;
1261 Py_ssize_t num_surrogates;
1262#if SIZEOF_WCHAR_T == 2
1263 Py_ssize_t length_wo_surrogates;
1264#endif
1265
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001266 assert(p_obj != NULL);
1267 unicode = (PyUnicodeObject *)*p_obj;
1268
Georg Brandl7597add2011-10-05 16:36:47 +02001269 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001270 strings were created using _PyObject_New() and where no canonical
1271 representation (the str field) has been set yet aka strings
1272 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001273 assert(_PyUnicode_CHECK(unicode));
1274 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001275 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001276 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001277 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001278 /* Actually, it should neither be interned nor be anything else: */
1279 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001280
1281#ifdef Py_DEBUG
1282 ++unicode_ready_calls;
1283#endif
1284
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001285#ifdef Py_DEBUG
1286 assert(!replace || Py_REFCNT(unicode) == 1);
1287#else
1288 if (replace && Py_REFCNT(unicode) != 1)
1289 replace = 0;
1290#endif
1291 if (replace) {
1292 Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode);
1293 wchar_t *wstr = _PyUnicode_WSTR(unicode);
1294 /* Optimization for empty strings */
1295 if (len == 0) {
1296 Py_INCREF(unicode_empty);
1297 Py_DECREF(*p_obj);
1298 *p_obj = unicode_empty;
1299 return 0;
1300 }
1301 if (len == 1 && wstr[0] < 256) {
1302 PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]);
1303 if (latin1_char == NULL)
1304 return -1;
1305 Py_DECREF(*p_obj);
1306 *p_obj = latin1_char;
1307 return 0;
1308 }
1309 }
1310
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001311 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001312 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001313 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001314 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001315
1316 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001317 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1318 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001319 PyErr_NoMemory();
1320 return -1;
1321 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001322 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001323 _PyUnicode_WSTR(unicode), end,
1324 PyUnicode_1BYTE_DATA(unicode));
1325 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1326 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1327 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1328 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001329 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001330 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001331 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001332 }
1333 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001334 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001335 _PyUnicode_UTF8(unicode) = NULL;
1336 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001337 }
1338 PyObject_FREE(_PyUnicode_WSTR(unicode));
1339 _PyUnicode_WSTR(unicode) = NULL;
1340 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1341 }
1342 /* In this case we might have to convert down from 4-byte native
1343 wchar_t to 2-byte unicode. */
1344 else if (maxchar < 65536) {
1345 assert(num_surrogates == 0 &&
1346 "FindMaxCharAndNumSurrogatePairs() messed up");
1347
Victor Stinner506f5922011-09-28 22:34:18 +02001348#if SIZEOF_WCHAR_T == 2
1349 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001350 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001351 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1352 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1353 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001354 _PyUnicode_UTF8(unicode) = NULL;
1355 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001356#else
1357 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001358 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001359 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001360 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001361 PyErr_NoMemory();
1362 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001363 }
Victor Stinner506f5922011-09-28 22:34:18 +02001364 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1365 _PyUnicode_WSTR(unicode), end,
1366 PyUnicode_2BYTE_DATA(unicode));
1367 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1368 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1369 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001370 _PyUnicode_UTF8(unicode) = NULL;
1371 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001372 PyObject_FREE(_PyUnicode_WSTR(unicode));
1373 _PyUnicode_WSTR(unicode) = NULL;
1374 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1375#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001376 }
1377 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1378 else {
1379#if SIZEOF_WCHAR_T == 2
1380 /* in case the native representation is 2-bytes, we need to allocate a
1381 new normalized 4-byte version. */
1382 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001383 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1384 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001385 PyErr_NoMemory();
1386 return -1;
1387 }
1388 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1389 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001390 _PyUnicode_UTF8(unicode) = NULL;
1391 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001392 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1393 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001394 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001395 PyObject_FREE(_PyUnicode_WSTR(unicode));
1396 _PyUnicode_WSTR(unicode) = NULL;
1397 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1398#else
1399 assert(num_surrogates == 0);
1400
Victor Stinnerc3c74152011-10-02 20:39:55 +02001401 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001402 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001403 _PyUnicode_UTF8(unicode) = NULL;
1404 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001405 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1406#endif
1407 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1408 }
1409 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001410 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001411 return 0;
1412}
1413
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001414int
1415_PyUnicode_ReadyReplace(PyObject **op)
1416{
1417 return unicode_ready(op, 1);
1418}
1419
1420int
1421_PyUnicode_Ready(PyObject *op)
1422{
1423 return unicode_ready(&op, 0);
1424}
1425
Alexander Belopolsky40018472011-02-26 01:02:56 +00001426static void
1427unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001428{
Walter Dörwald16807132007-05-25 13:52:07 +00001429 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001430 case SSTATE_NOT_INTERNED:
1431 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001432
Benjamin Peterson29060642009-01-31 22:14:21 +00001433 case SSTATE_INTERNED_MORTAL:
1434 /* revive dead object temporarily for DelItem */
1435 Py_REFCNT(unicode) = 3;
1436 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
1437 Py_FatalError(
1438 "deletion of interned string failed");
1439 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001440
Benjamin Peterson29060642009-01-31 22:14:21 +00001441 case SSTATE_INTERNED_IMMORTAL:
1442 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001443
Benjamin Peterson29060642009-01-31 22:14:21 +00001444 default:
1445 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001446 }
1447
Victor Stinner03490912011-10-03 23:45:12 +02001448 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001449 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001450 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001451 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001452
1453 if (PyUnicode_IS_COMPACT(unicode)) {
1454 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001455 }
1456 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001457 if (_PyUnicode_DATA_ANY(unicode))
1458 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Benjamin Peterson29060642009-01-31 22:14:21 +00001459 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001460 }
1461}
1462
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if unicode is one of the shared singletons
   (the empty string, or the cached one-character string stored in
   unicode_latin1[] for its code point), 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    ascii = (PyASCIIObject *)unicode;
    /* Only non-wchar-kind strings of length 1 can be cached characters. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode) ? 1 : 0;
}
#endif
1479
Alexander Belopolsky40018472011-02-26 01:02:56 +00001480static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001481unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001482{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001483 if (Py_REFCNT(unicode) != 1)
1484 return 0;
1485 if (PyUnicode_CHECK_INTERNED(unicode))
1486 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001487#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001488 /* singleton refcount is greater than 1 */
1489 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001490#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001491 return 1;
1492}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001493
Victor Stinnerfe226c02011-10-03 03:52:20 +02001494static int
1495unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1496{
1497 PyObject *unicode;
1498 Py_ssize_t old_length;
1499
1500 assert(p_unicode != NULL);
1501 unicode = *p_unicode;
1502
1503 assert(unicode != NULL);
1504 assert(PyUnicode_Check(unicode));
1505 assert(0 <= length);
1506
Victor Stinner910337b2011-10-03 03:20:16 +02001507 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001508 old_length = PyUnicode_WSTR_LENGTH(unicode);
1509 else
1510 old_length = PyUnicode_GET_LENGTH(unicode);
1511 if (old_length == length)
1512 return 0;
1513
Victor Stinnerfe226c02011-10-03 03:52:20 +02001514 if (!unicode_resizable(unicode)) {
1515 PyObject *copy = resize_copy(unicode, length);
1516 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001517 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001518 Py_DECREF(*p_unicode);
1519 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001520 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001521 }
1522
Victor Stinnerfe226c02011-10-03 03:52:20 +02001523 if (PyUnicode_IS_COMPACT(unicode)) {
1524 *p_unicode = resize_compact(unicode, length);
1525 if (*p_unicode == NULL)
1526 return -1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001527 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001528 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001529 }
1530 return resize_inplace((PyUnicodeObject*)unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001531}
1532
Alexander Belopolsky40018472011-02-26 01:02:56 +00001533int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001534PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001535{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001536 PyObject *unicode;
1537 if (p_unicode == NULL) {
1538 PyErr_BadInternalCall();
1539 return -1;
1540 }
1541 unicode = *p_unicode;
1542 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0
1543 || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND)
1544 {
1545 PyErr_BadInternalCall();
1546 return -1;
1547 }
1548 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001549}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001550
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001551static PyObject*
1552get_latin1_char(unsigned char ch)
1553{
Victor Stinnera464fc12011-10-02 20:39:30 +02001554 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001555 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001556 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001557 if (!unicode)
1558 return NULL;
1559 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001560 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001561 unicode_latin1[ch] = unicode;
1562 }
1563 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001564 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001565}
1566
Alexander Belopolsky40018472011-02-26 01:02:56 +00001567PyObject *
1568PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001569{
1570 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001571 Py_UCS4 maxchar = 0;
1572 Py_ssize_t num_surrogates;
1573
1574 if (u == NULL)
1575 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001576
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001577 /* If the Unicode data is known at construction time, we can apply
1578 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001579
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001580 /* Optimization for empty strings */
1581 if (size == 0 && unicode_empty != NULL) {
1582 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001583 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001584 }
Tim Petersced69f82003-09-16 20:30:58 +00001585
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001586 /* Single character Unicode objects in the Latin-1 range are
1587 shared when using this constructor */
1588 if (size == 1 && *u < 256)
1589 return get_latin1_char((unsigned char)*u);
1590
1591 /* If not empty and not single character, copy the Unicode data
1592 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001593 if (find_maxchar_surrogates(u, u + size,
1594 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001595 return NULL;
1596
1597 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1598 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001599 if (!unicode)
1600 return NULL;
1601
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001602 switch (PyUnicode_KIND(unicode)) {
1603 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001604 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001605 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1606 break;
1607 case PyUnicode_2BYTE_KIND:
1608#if Py_UNICODE_SIZE == 2
1609 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1610#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001611 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001612 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1613#endif
1614 break;
1615 case PyUnicode_4BYTE_KIND:
1616#if SIZEOF_WCHAR_T == 2
1617 /* This is the only case which has to process surrogates, thus
1618 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001619 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001620#else
1621 assert(num_surrogates == 0);
1622 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1623#endif
1624 break;
1625 default:
1626 assert(0 && "Impossible state");
1627 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001628
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001629 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001630 return (PyObject *)unicode;
1631}
1632
Alexander Belopolsky40018472011-02-26 01:02:56 +00001633PyObject *
1634PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001635{
1636 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001637
Benjamin Peterson14339b62009-01-31 16:36:08 +00001638 if (size < 0) {
1639 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001640 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001641 return NULL;
1642 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001643
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001644 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001645 some optimizations which share commonly used objects.
1646 Also, this means the input must be UTF-8, so fall back to the
1647 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001648 if (u != NULL) {
1649
Benjamin Peterson29060642009-01-31 22:14:21 +00001650 /* Optimization for empty strings */
1651 if (size == 0 && unicode_empty != NULL) {
1652 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001653 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001654 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001655
1656 /* Single characters are shared when using this constructor.
1657 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001658 if (size == 1 && Py_CHARMASK(*u) < 128)
1659 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001660
1661 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001662 }
1663
Walter Dörwald55507312007-05-18 13:12:10 +00001664 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001665 if (!unicode)
1666 return NULL;
1667
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001668 return (PyObject *)unicode;
1669}
1670
Alexander Belopolsky40018472011-02-26 01:02:56 +00001671PyObject *
1672PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001673{
1674 size_t size = strlen(u);
1675 if (size > PY_SSIZE_T_MAX) {
1676 PyErr_SetString(PyExc_OverflowError, "input too long");
1677 return NULL;
1678 }
1679
1680 return PyUnicode_FromStringAndSize(u, size);
1681}
1682
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001683PyObject *
1684_PyUnicode_FromId(_Py_Identifier *id)
1685{
1686 if (!id->object) {
1687 id->object = PyUnicode_FromString(id->string);
1688 if (!id->object)
1689 return NULL;
1690 PyUnicode_InternInPlace(&id->object);
1691 assert(!id->next);
1692 id->next = static_strings;
1693 static_strings = id;
1694 }
1695 Py_INCREF(id->object);
1696 return id->object;
1697}
1698
1699void
1700_PyUnicode_ClearStaticStrings()
1701{
1702 _Py_Identifier *i;
1703 for (i = static_strings; i; i = i->next) {
1704 Py_DECREF(i->object);
1705 i->object = NULL;
1706 i->next = NULL;
1707 }
1708}
1709
Victor Stinnere57b1c02011-09-28 22:20:48 +02001710static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001711unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001712{
Victor Stinner0617b6e2011-10-05 23:26:01 +02001713 PyObject *res;
1714#ifdef Py_DEBUG
1715 const unsigned char *p;
1716 const unsigned char *end = s + size;
1717 for (p=s; p < end; p++) {
1718 assert(*p < 128);
1719 }
1720#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001721 if (size == 1)
1722 return get_latin1_char(s[0]);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001723 res = PyUnicode_New(size, 127);
Victor Stinner702c7342011-10-05 13:50:52 +02001724 if (!res)
1725 return NULL;
Victor Stinner0617b6e2011-10-05 23:26:01 +02001726 memcpy(PyUnicode_1BYTE_DATA(res), s, size);
Victor Stinner702c7342011-10-05 13:50:52 +02001727 return res;
1728}
1729
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001730static Py_UCS4
1731kind_maxchar_limit(unsigned int kind)
1732{
1733 switch(kind) {
1734 case PyUnicode_1BYTE_KIND:
1735 return 0x80;
1736 case PyUnicode_2BYTE_KIND:
1737 return 0x100;
1738 case PyUnicode_4BYTE_KIND:
1739 return 0x10000;
1740 default:
1741 assert(0 && "invalid kind");
1742 return 0x10ffff;
1743 }
1744}
1745
Victor Stinner702c7342011-10-05 13:50:52 +02001746static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001747_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001748{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001749 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001750 unsigned char max_char = 127;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001751
1752 assert(size >= 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001753 if (size == 1)
1754 return get_latin1_char(u[0]);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001755 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001756 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001757 if (!res)
1758 return NULL;
1759 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001760 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001761 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001762}
1763
Victor Stinnere57b1c02011-09-28 22:20:48 +02001764static PyObject*
1765_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001766{
1767 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001768 Py_UCS2 max_char = 0;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001769
1770 assert(size >= 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001771 if (size == 1 && u[0] < 256)
Victor Stinner4e101002011-10-11 23:27:52 +02001772 return get_latin1_char((unsigned char)u[0]);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001773 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001774 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001775 if (!res)
1776 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001777 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001778 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001779 else {
1780 _PyUnicode_CONVERT_BYTES(
1781 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1782 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001783 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001784 return res;
1785}
1786
Victor Stinnere57b1c02011-09-28 22:20:48 +02001787static PyObject*
1788_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001789{
1790 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001791 Py_UCS4 max_char = 0;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001792
1793 assert(size >= 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001794 if (size == 1 && u[0] < 256)
1795 return get_latin1_char(u[0]);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001796 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001797 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001798 if (!res)
1799 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001800 if (max_char < 256)
1801 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1802 PyUnicode_1BYTE_DATA(res));
1803 else if (max_char < 0x10000)
1804 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1805 PyUnicode_2BYTE_DATA(res));
1806 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001807 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001808 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001809 return res;
1810}
1811
1812PyObject*
1813PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1814{
1815 switch(kind) {
1816 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001817 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001818 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001819 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001820 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001821 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001822 default:
1823 assert(0 && "invalid kind");
1824 PyErr_SetString(PyExc_SystemError, "invalid kind");
1825 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001826 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001827}
1828
Victor Stinner25a4b292011-10-06 12:31:55 +02001829/* Ensure that a string uses the most efficient storage, if it is not the
1830 case: create a new string with of the right kind. Write NULL into *p_unicode
1831 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001832static void
Victor Stinner25a4b292011-10-06 12:31:55 +02001833unicode_adjust_maxchar(PyObject **p_unicode)
1834{
1835 PyObject *unicode, *copy;
1836 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001837 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02001838 unsigned int kind;
1839
1840 assert(p_unicode != NULL);
1841 unicode = *p_unicode;
1842 assert(PyUnicode_IS_READY(unicode));
1843 if (PyUnicode_IS_ASCII(unicode))
1844 return;
1845
1846 len = PyUnicode_GET_LENGTH(unicode);
1847 kind = PyUnicode_KIND(unicode);
1848 if (kind == PyUnicode_1BYTE_KIND) {
1849 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001850 max_char = ucs1lib_find_max_char(u, u + len);
1851 if (max_char >= 128)
1852 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001853 }
1854 else if (kind == PyUnicode_2BYTE_KIND) {
1855 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001856 max_char = ucs2lib_find_max_char(u, u + len);
1857 if (max_char >= 256)
1858 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001859 }
1860 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001861 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02001862 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001863 max_char = ucs4lib_find_max_char(u, u + len);
1864 if (max_char >= 0x10000)
1865 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001866 }
Victor Stinner25a4b292011-10-06 12:31:55 +02001867 copy = PyUnicode_New(len, max_char);
1868 copy_characters(copy, 0, unicode, 0, len);
1869 Py_DECREF(unicode);
1870 *p_unicode = copy;
1871}
1872
Victor Stinner034f6cf2011-09-30 02:26:44 +02001873PyObject*
1874PyUnicode_Copy(PyObject *unicode)
1875{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001876 Py_ssize_t size;
1877 PyObject *copy;
1878 void *data;
1879
Victor Stinner034f6cf2011-09-30 02:26:44 +02001880 if (!PyUnicode_Check(unicode)) {
1881 PyErr_BadInternalCall();
1882 return NULL;
1883 }
1884 if (PyUnicode_READY(unicode))
1885 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001886
1887 size = PyUnicode_GET_LENGTH(unicode);
1888 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1889 if (!copy)
1890 return NULL;
1891 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1892
1893 data = PyUnicode_DATA(unicode);
1894 switch (PyUnicode_KIND(unicode))
1895 {
1896 case PyUnicode_1BYTE_KIND:
1897 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1898 break;
1899 case PyUnicode_2BYTE_KIND:
1900 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1901 break;
1902 case PyUnicode_4BYTE_KIND:
1903 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1904 break;
1905 default:
1906 assert(0);
1907 break;
1908 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001909 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001910 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001911}
1912
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001913
Victor Stinnerbc603d12011-10-02 01:00:40 +02001914/* Widen Unicode objects to larger buffers. Don't write terminating null
1915 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001916
1917void*
1918_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1919{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001920 Py_ssize_t len;
1921 void *result;
1922 unsigned int skind;
1923
1924 if (PyUnicode_READY(s))
1925 return NULL;
1926
1927 len = PyUnicode_GET_LENGTH(s);
1928 skind = PyUnicode_KIND(s);
1929 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02001930 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001931 return NULL;
1932 }
1933 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001934 case PyUnicode_2BYTE_KIND:
1935 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1936 if (!result)
1937 return PyErr_NoMemory();
1938 assert(skind == PyUnicode_1BYTE_KIND);
1939 _PyUnicode_CONVERT_BYTES(
1940 Py_UCS1, Py_UCS2,
1941 PyUnicode_1BYTE_DATA(s),
1942 PyUnicode_1BYTE_DATA(s) + len,
1943 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001944 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001945 case PyUnicode_4BYTE_KIND:
1946 result = PyMem_Malloc(len * sizeof(Py_UCS4));
1947 if (!result)
1948 return PyErr_NoMemory();
1949 if (skind == PyUnicode_2BYTE_KIND) {
1950 _PyUnicode_CONVERT_BYTES(
1951 Py_UCS2, Py_UCS4,
1952 PyUnicode_2BYTE_DATA(s),
1953 PyUnicode_2BYTE_DATA(s) + len,
1954 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001955 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001956 else {
1957 assert(skind == PyUnicode_1BYTE_KIND);
1958 _PyUnicode_CONVERT_BYTES(
1959 Py_UCS1, Py_UCS4,
1960 PyUnicode_1BYTE_DATA(s),
1961 PyUnicode_1BYTE_DATA(s) + len,
1962 result);
1963 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001964 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001965 default:
1966 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001967 }
Victor Stinner01698042011-10-04 00:04:26 +02001968 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001969 return NULL;
1970}
1971
1972static Py_UCS4*
1973as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1974 int copy_null)
1975{
1976 int kind;
1977 void *data;
1978 Py_ssize_t len, targetlen;
1979 if (PyUnicode_READY(string) == -1)
1980 return NULL;
1981 kind = PyUnicode_KIND(string);
1982 data = PyUnicode_DATA(string);
1983 len = PyUnicode_GET_LENGTH(string);
1984 targetlen = len;
1985 if (copy_null)
1986 targetlen++;
1987 if (!target) {
1988 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1989 PyErr_NoMemory();
1990 return NULL;
1991 }
1992 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1993 if (!target) {
1994 PyErr_NoMemory();
1995 return NULL;
1996 }
1997 }
1998 else {
1999 if (targetsize < targetlen) {
2000 PyErr_Format(PyExc_SystemError,
2001 "string is longer than the buffer");
2002 if (copy_null && 0 < targetsize)
2003 target[0] = 0;
2004 return NULL;
2005 }
2006 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002007 if (kind == PyUnicode_1BYTE_KIND) {
2008 Py_UCS1 *start = (Py_UCS1 *) data;
2009 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002010 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002011 else if (kind == PyUnicode_2BYTE_KIND) {
2012 Py_UCS2 *start = (Py_UCS2 *) data;
2013 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2014 }
2015 else {
2016 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002017 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002018 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002019 if (copy_null)
2020 target[len] = 0;
2021 return target;
2022}
2023
2024Py_UCS4*
2025PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2026 int copy_null)
2027{
2028 if (target == NULL || targetsize < 1) {
2029 PyErr_BadInternalCall();
2030 return NULL;
2031 }
2032 return as_ucs4(string, target, targetsize, copy_null);
2033}
2034
2035Py_UCS4*
2036PyUnicode_AsUCS4Copy(PyObject *string)
2037{
2038 return as_ucs4(string, NULL, 0, 1);
2039}
2040
#ifdef HAVE_WCHAR_H

/* Create a unicode object from the wide character buffer w.

   A size of -1 means that w is 0-terminated and is measured with
   wcslen().  With w == NULL only size == 0 is accepted (yielding
   PyUnicode_New(0, 0)); any other size is reported through
   PyErr_BadInternalCall().  Returns NULL on failure. */
PyObject *
PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
{
    if (w != NULL) {
        if (size == -1) {
            size = wcslen(w);
        }
        return PyUnicode_FromUnicode(w, size);
    }

    if (size == 0)
        return PyUnicode_New(0, 0);

    PyErr_BadInternalCall();
    return NULL;
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002061
Walter Dörwald346737f2007-05-31 10:44:43 +00002062static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002063makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2064 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002065{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002066 *fmt++ = '%';
2067 if (width) {
2068 if (zeropad)
2069 *fmt++ = '0';
2070 fmt += sprintf(fmt, "%d", width);
2071 }
2072 if (precision)
2073 fmt += sprintf(fmt, ".%d", precision);
2074 if (longflag)
2075 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002076 else if (longlongflag) {
2077 /* longlongflag should only ever be nonzero on machines with
2078 HAVE_LONG_LONG defined */
2079#ifdef HAVE_LONG_LONG
2080 char *f = PY_FORMAT_LONG_LONG;
2081 while (*f)
2082 *fmt++ = *f++;
2083#else
2084 /* we shouldn't ever get here */
2085 assert(0);
2086 *fmt++ = 'l';
2087#endif
2088 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002089 else if (size_tflag) {
2090 char *f = PY_FORMAT_SIZE_T;
2091 while (*f)
2092 *fmt++ = *f++;
2093 }
2094 *fmt++ = c;
2095 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002096}
2097
Victor Stinner96865452011-03-01 23:44:09 +00002098/* helper for PyUnicode_FromFormatV() */
2099
2100static const char*
2101parse_format_flags(const char *f,
2102 int *p_width, int *p_precision,
2103 int *p_longflag, int *p_longlongflag, int *p_size_tflag)
2104{
2105 int width, precision, longflag, longlongflag, size_tflag;
2106
2107 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2108 f++;
2109 width = 0;
2110 while (Py_ISDIGIT((unsigned)*f))
2111 width = (width*10) + *f++ - '0';
2112 precision = 0;
2113 if (*f == '.') {
2114 f++;
2115 while (Py_ISDIGIT((unsigned)*f))
2116 precision = (precision*10) + *f++ - '0';
2117 if (*f == '%') {
2118 /* "%.3%s" => f points to "3" */
2119 f--;
2120 }
2121 }
2122 if (*f == '\0') {
2123 /* bogus format "%.1" => go backward, f points to "1" */
2124 f--;
2125 }
2126 if (p_width != NULL)
2127 *p_width = width;
2128 if (p_precision != NULL)
2129 *p_precision = precision;
2130
2131 /* Handle %ld, %lu, %lld and %llu. */
2132 longflag = 0;
2133 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002134 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002135
2136 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002137 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002138 longflag = 1;
2139 ++f;
2140 }
2141#ifdef HAVE_LONG_LONG
2142 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002143 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002144 longlongflag = 1;
2145 f += 2;
2146 }
2147#endif
2148 }
2149 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002150 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002151 size_tflag = 1;
2152 ++f;
2153 }
2154 if (p_longflag != NULL)
2155 *p_longflag = longflag;
2156 if (p_longlongflag != NULL)
2157 *p_longlongflag = longlongflag;
2158 if (p_size_tflag != NULL)
2159 *p_size_tflag = size_tflag;
2160 return f;
2161}
2162
/* Maximum number of characters required for the output of %ld.
   21 characters are enough for a 64-bit integer in decimal, including an
   optional sign. */
#define MAX_LONG_CHARS 21
/* Maximum number of characters required for the output of %lld.
   At most ceil(log10(256)*SIZEOF_LONG_LONG) digits are needed, plus 1 for
   the sign; 53/22 is used as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2170
Walter Dörwaldd2034312007-05-18 16:29:38 +00002171PyObject *
2172PyUnicode_FromFormatV(const char *format, va_list vargs)
2173{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002174 va_list count;
2175 Py_ssize_t callcount = 0;
2176 PyObject **callresults = NULL;
2177 PyObject **callresult = NULL;
2178 Py_ssize_t n = 0;
2179 int width = 0;
2180 int precision = 0;
2181 int zeropad;
2182 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002183 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002184 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002185 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002186 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2187 Py_UCS4 argmaxchar;
2188 Py_ssize_t numbersize = 0;
2189 char *numberresults = NULL;
2190 char *numberresult = NULL;
2191 Py_ssize_t i;
2192 int kind;
2193 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002194
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002195 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002196 /* step 1: count the number of %S/%R/%A/%s format specifications
2197 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2198 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002199 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002200 * also estimate a upper bound for all the number formats in the string,
2201 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002202 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002203 for (f = format; *f; f++) {
2204 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002205 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002206 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2207 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2208 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2209 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002210
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002211 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002212#ifdef HAVE_LONG_LONG
2213 if (longlongflag) {
2214 if (width < MAX_LONG_LONG_CHARS)
2215 width = MAX_LONG_LONG_CHARS;
2216 }
2217 else
2218#endif
2219 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2220 including sign. Decimal takes the most space. This
2221 isn't enough for octal. If a width is specified we
2222 need more (which we allocate later). */
2223 if (width < MAX_LONG_CHARS)
2224 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002225
2226 /* account for the size + '\0' to separate numbers
2227 inside of the numberresults buffer */
2228 numbersize += (width + 1);
2229 }
2230 }
2231 else if ((unsigned char)*f > 127) {
2232 PyErr_Format(PyExc_ValueError,
2233 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2234 "string, got a non-ASCII byte: 0x%02x",
2235 (unsigned char)*f);
2236 return NULL;
2237 }
2238 }
2239 /* step 2: allocate memory for the results of
2240 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2241 if (callcount) {
2242 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2243 if (!callresults) {
2244 PyErr_NoMemory();
2245 return NULL;
2246 }
2247 callresult = callresults;
2248 }
2249 /* step 2.5: allocate memory for the results of formating numbers */
2250 if (numbersize) {
2251 numberresults = PyObject_Malloc(numbersize);
2252 if (!numberresults) {
2253 PyErr_NoMemory();
2254 goto fail;
2255 }
2256 numberresult = numberresults;
2257 }
2258
2259 /* step 3: format numbers and figure out how large a buffer we need */
2260 for (f = format; *f; f++) {
2261 if (*f == '%') {
2262 const char* p;
2263 int longflag;
2264 int longlongflag;
2265 int size_tflag;
2266 int numprinted;
2267
2268 p = f;
2269 zeropad = (f[1] == '0');
2270 f = parse_format_flags(f, &width, &precision,
2271 &longflag, &longlongflag, &size_tflag);
2272 switch (*f) {
2273 case 'c':
2274 {
2275 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002276 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002277 n++;
2278 break;
2279 }
2280 case '%':
2281 n++;
2282 break;
2283 case 'i':
2284 case 'd':
2285 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2286 width, precision, *f);
2287 if (longflag)
2288 numprinted = sprintf(numberresult, fmt,
2289 va_arg(count, long));
2290#ifdef HAVE_LONG_LONG
2291 else if (longlongflag)
2292 numprinted = sprintf(numberresult, fmt,
2293 va_arg(count, PY_LONG_LONG));
2294#endif
2295 else if (size_tflag)
2296 numprinted = sprintf(numberresult, fmt,
2297 va_arg(count, Py_ssize_t));
2298 else
2299 numprinted = sprintf(numberresult, fmt,
2300 va_arg(count, int));
2301 n += numprinted;
2302 /* advance by +1 to skip over the '\0' */
2303 numberresult += (numprinted + 1);
2304 assert(*(numberresult - 1) == '\0');
2305 assert(*(numberresult - 2) != '\0');
2306 assert(numprinted >= 0);
2307 assert(numberresult <= numberresults + numbersize);
2308 break;
2309 case 'u':
2310 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2311 width, precision, 'u');
2312 if (longflag)
2313 numprinted = sprintf(numberresult, fmt,
2314 va_arg(count, unsigned long));
2315#ifdef HAVE_LONG_LONG
2316 else if (longlongflag)
2317 numprinted = sprintf(numberresult, fmt,
2318 va_arg(count, unsigned PY_LONG_LONG));
2319#endif
2320 else if (size_tflag)
2321 numprinted = sprintf(numberresult, fmt,
2322 va_arg(count, size_t));
2323 else
2324 numprinted = sprintf(numberresult, fmt,
2325 va_arg(count, unsigned int));
2326 n += numprinted;
2327 numberresult += (numprinted + 1);
2328 assert(*(numberresult - 1) == '\0');
2329 assert(*(numberresult - 2) != '\0');
2330 assert(numprinted >= 0);
2331 assert(numberresult <= numberresults + numbersize);
2332 break;
2333 case 'x':
2334 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2335 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2336 n += numprinted;
2337 numberresult += (numprinted + 1);
2338 assert(*(numberresult - 1) == '\0');
2339 assert(*(numberresult - 2) != '\0');
2340 assert(numprinted >= 0);
2341 assert(numberresult <= numberresults + numbersize);
2342 break;
2343 case 'p':
2344 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2345 /* %p is ill-defined: ensure leading 0x. */
2346 if (numberresult[1] == 'X')
2347 numberresult[1] = 'x';
2348 else if (numberresult[1] != 'x') {
2349 memmove(numberresult + 2, numberresult,
2350 strlen(numberresult) + 1);
2351 numberresult[0] = '0';
2352 numberresult[1] = 'x';
2353 numprinted += 2;
2354 }
2355 n += numprinted;
2356 numberresult += (numprinted + 1);
2357 assert(*(numberresult - 1) == '\0');
2358 assert(*(numberresult - 2) != '\0');
2359 assert(numprinted >= 0);
2360 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002361 break;
2362 case 's':
2363 {
2364 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002365 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002366 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2367 if (!str)
2368 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002369 /* since PyUnicode_DecodeUTF8 returns already flexible
2370 unicode objects, there is no need to call ready on them */
2371 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002372 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002373 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002374 /* Remember the str and switch to the next slot */
2375 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002376 break;
2377 }
2378 case 'U':
2379 {
2380 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002381 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002382 if (PyUnicode_READY(obj) == -1)
2383 goto fail;
2384 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002385 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002386 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002387 break;
2388 }
2389 case 'V':
2390 {
2391 PyObject *obj = va_arg(count, PyObject *);
2392 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002393 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002394 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002395 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002396 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002397 if (PyUnicode_READY(obj) == -1)
2398 goto fail;
2399 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002400 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002401 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002402 *callresult++ = NULL;
2403 }
2404 else {
2405 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2406 if (!str_obj)
2407 goto fail;
Victor Stinnere1335c72011-10-04 20:53:03 +02002408 if (PyUnicode_READY(str_obj)) {
2409 Py_DECREF(str_obj);
2410 goto fail;
2411 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002412 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002413 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002414 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002415 *callresult++ = str_obj;
2416 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002417 break;
2418 }
2419 case 'S':
2420 {
2421 PyObject *obj = va_arg(count, PyObject *);
2422 PyObject *str;
2423 assert(obj);
2424 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002425 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002426 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002427 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002428 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002429 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002430 /* Remember the str and switch to the next slot */
2431 *callresult++ = str;
2432 break;
2433 }
2434 case 'R':
2435 {
2436 PyObject *obj = va_arg(count, PyObject *);
2437 PyObject *repr;
2438 assert(obj);
2439 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002440 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002441 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002442 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002443 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002444 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002445 /* Remember the repr and switch to the next slot */
2446 *callresult++ = repr;
2447 break;
2448 }
2449 case 'A':
2450 {
2451 PyObject *obj = va_arg(count, PyObject *);
2452 PyObject *ascii;
2453 assert(obj);
2454 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002455 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002456 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002457 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002458 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002459 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002460 /* Remember the repr and switch to the next slot */
2461 *callresult++ = ascii;
2462 break;
2463 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002464 default:
2465 /* if we stumble upon an unknown
2466 formatting code, copy the rest of
2467 the format string to the output
2468 string. (we cannot just skip the
2469 code, since there's no way to know
2470 what's in the argument list) */
2471 n += strlen(p);
2472 goto expand;
2473 }
2474 } else
2475 n++;
2476 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002477 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002478 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002479 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002480 we don't have to resize the string.
2481 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002482 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002483 if (!string)
2484 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002485 kind = PyUnicode_KIND(string);
2486 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002487 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002488 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002489
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002490 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002491 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002492 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002493
2494 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002495 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2496 /* checking for == because the last argument could be a empty
2497 string, which causes i to point to end, the assert at the end of
2498 the loop */
2499 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002500
Benjamin Peterson14339b62009-01-31 16:36:08 +00002501 switch (*f) {
2502 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002503 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002504 const int ordinal = va_arg(vargs, int);
2505 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002506 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002507 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002508 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002509 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002510 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002511 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002512 case 'p':
2513 /* unused, since we already have the result */
2514 if (*f == 'p')
2515 (void) va_arg(vargs, void *);
2516 else
2517 (void) va_arg(vargs, int);
2518 /* extract the result from numberresults and append. */
2519 for (; *numberresult; ++i, ++numberresult)
2520 PyUnicode_WRITE(kind, data, i, *numberresult);
2521 /* skip over the separating '\0' */
2522 assert(*numberresult == '\0');
2523 numberresult++;
2524 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002525 break;
2526 case 's':
2527 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002528 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002529 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002530 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002531 size = PyUnicode_GET_LENGTH(*callresult);
2532 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002533 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002534 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002535 /* We're done with the unicode()/repr() => forget it */
2536 Py_DECREF(*callresult);
2537 /* switch to next unicode()/repr() result */
2538 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002539 break;
2540 }
2541 case 'U':
2542 {
2543 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002544 Py_ssize_t size;
2545 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2546 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002547 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002548 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002549 break;
2550 }
2551 case 'V':
2552 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002553 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002554 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002555 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002556 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002557 size = PyUnicode_GET_LENGTH(obj);
2558 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002559 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002560 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002561 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002562 size = PyUnicode_GET_LENGTH(*callresult);
2563 assert(PyUnicode_KIND(*callresult) <=
2564 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002565 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002566 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002567 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002568 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002569 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002570 break;
2571 }
2572 case 'S':
2573 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002574 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002575 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002576 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002577 /* unused, since we already have the result */
2578 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002579 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002580 copy_characters(string, i, *callresult, 0, size);
2581 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002582 /* We're done with the unicode()/repr() => forget it */
2583 Py_DECREF(*callresult);
2584 /* switch to next unicode()/repr() result */
2585 ++callresult;
2586 break;
2587 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002588 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002589 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002590 break;
2591 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002592 for (; *p; ++p, ++i)
2593 PyUnicode_WRITE(kind, data, i, *p);
2594 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002595 goto end;
2596 }
Victor Stinner1205f272010-09-11 00:54:47 +00002597 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002598 else {
2599 assert(i < PyUnicode_GET_LENGTH(string));
2600 PyUnicode_WRITE(kind, data, i++, *f);
2601 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002602 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002603 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002604
Benjamin Peterson29060642009-01-31 22:14:21 +00002605 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002606 if (callresults)
2607 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002608 if (numberresults)
2609 PyObject_Free(numberresults);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002610 assert(_PyUnicode_CheckConsistency(string, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002611 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00002612 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002613 if (callresults) {
2614 PyObject **callresult2 = callresults;
2615 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002616 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002617 ++callresult2;
2618 }
2619 PyObject_Free(callresults);
2620 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002621 if (numberresults)
2622 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002623 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002624}
2625
Walter Dörwaldd2034312007-05-18 16:29:38 +00002626PyObject *
2627PyUnicode_FromFormat(const char *format, ...)
2628{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002629 PyObject* ret;
2630 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002631
2632#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002633 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002634#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002635 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002636#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002637 ret = PyUnicode_FromFormatV(format, vargs);
2638 va_end(vargs);
2639 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002640}
2641
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002642#ifdef HAVE_WCHAR_H
2643
Victor Stinner5593d8a2010-10-02 11:11:27 +00002644/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2645 convert a Unicode object to a wide character string.
2646
Victor Stinnerd88d9832011-09-06 02:00:05 +02002647 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002648 character) required to convert the unicode object. Ignore size argument.
2649
Victor Stinnerd88d9832011-09-06 02:00:05 +02002650 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002651 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002652 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002653static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00002654unicode_aswidechar(PyUnicodeObject *unicode,
2655 wchar_t *w,
2656 Py_ssize_t size)
2657{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002658 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002659 const wchar_t *wstr;
2660
2661 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
2662 if (wstr == NULL)
2663 return -1;
2664
Victor Stinner5593d8a2010-10-02 11:11:27 +00002665 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002666 if (size > res)
2667 size = res + 1;
2668 else
2669 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002670 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002671 return res;
2672 }
2673 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002674 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002675}
2676
2677Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002678PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002679 wchar_t *w,
2680 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002681{
2682 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002683 PyErr_BadInternalCall();
2684 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002685 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002686 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002687}
2688
Victor Stinner137c34c2010-09-29 10:25:54 +00002689wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002690PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002691 Py_ssize_t *size)
2692{
2693 wchar_t* buffer;
2694 Py_ssize_t buflen;
2695
2696 if (unicode == NULL) {
2697 PyErr_BadInternalCall();
2698 return NULL;
2699 }
2700
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002701 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002702 if (buflen == -1)
2703 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002704 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002705 PyErr_NoMemory();
2706 return NULL;
2707 }
2708
Victor Stinner137c34c2010-09-29 10:25:54 +00002709 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2710 if (buffer == NULL) {
2711 PyErr_NoMemory();
2712 return NULL;
2713 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002714 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002715 if (buflen == -1)
2716 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002717 if (size != NULL)
2718 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002719 return buffer;
2720}
2721
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002722#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002723
Alexander Belopolsky40018472011-02-26 01:02:56 +00002724PyObject *
2725PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002726{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002727 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002728 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002729 PyErr_SetString(PyExc_ValueError,
2730 "chr() arg not in range(0x110000)");
2731 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002732 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002733
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002734 if (ordinal < 256)
2735 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002736
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002737 v = PyUnicode_New(1, ordinal);
2738 if (v == NULL)
2739 return NULL;
2740 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002741 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002742 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002743}
2744
Alexander Belopolsky40018472011-02-26 01:02:56 +00002745PyObject *
2746PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002747{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002748 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002749 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002750 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002751 if (PyUnicode_READY(obj))
2752 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002753 Py_INCREF(obj);
2754 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002755 }
2756 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002757 /* For a Unicode subtype that's not a Unicode object,
2758 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002759 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002760 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002761 PyErr_Format(PyExc_TypeError,
2762 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002763 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002764 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002765}
2766
Alexander Belopolsky40018472011-02-26 01:02:56 +00002767PyObject *
2768PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002769 const char *encoding,
2770 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002771{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002772 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002773 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002774
Guido van Rossumd57fd912000-03-10 22:53:23 +00002775 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002776 PyErr_BadInternalCall();
2777 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002778 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002779
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002780 /* Decoding bytes objects is the most common case and should be fast */
2781 if (PyBytes_Check(obj)) {
2782 if (PyBytes_GET_SIZE(obj) == 0) {
2783 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002784 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002785 }
2786 else {
2787 v = PyUnicode_Decode(
2788 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2789 encoding, errors);
2790 }
2791 return v;
2792 }
2793
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002794 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002795 PyErr_SetString(PyExc_TypeError,
2796 "decoding str is not supported");
2797 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002798 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002799
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002800 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2801 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2802 PyErr_Format(PyExc_TypeError,
2803 "coercing to str: need bytes, bytearray "
2804 "or buffer-like object, %.80s found",
2805 Py_TYPE(obj)->tp_name);
2806 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002807 }
Tim Petersced69f82003-09-16 20:30:58 +00002808
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002809 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002810 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002811 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002812 }
Tim Petersced69f82003-09-16 20:30:58 +00002813 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002814 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002815
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002816 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002817 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002818}
2819
/* Write a normalized copy of encoding into lower, a buffer of lower_len
   bytes: upper case letters are converted to lower case and every '_' is
   replaced with '-', so that e.g. "UTF_8" becomes "utf-8".

   Return 1 on success, 0 if the encoding name does not fit (it is longer
   than lower_len-1 characters); in that case lower is left without a
   terminating null byte. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* last slot of the buffer, reserved for the terminating null byte */
    char *const last = lower + (lower_len - 1);

    for (src = encoding; *src != '\0'; src++) {
        char c = *src;

        if (dst == last)
            return 0;
        if (Py_ISUPPER(c))
            c = Py_TOLOWER(c);
        else if (c == '_')
            c = '-';
        *dst++ = c;
    }
    *dst = '\0';
    return 1;
}
2852
Alexander Belopolsky40018472011-02-26 01:02:56 +00002853PyObject *
2854PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002855 Py_ssize_t size,
2856 const char *encoding,
2857 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002858{
2859 PyObject *buffer = NULL, *unicode;
2860 Py_buffer info;
2861 char lower[11]; /* Enough for any encoding shortcut */
2862
2863 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002864 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002865
2866 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002867 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002868 if ((strcmp(lower, "utf-8") == 0) ||
2869 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002870 return PyUnicode_DecodeUTF8(s, size, errors);
2871 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002872 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002873 (strcmp(lower, "iso-8859-1") == 0))
2874 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002875#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002876 else if (strcmp(lower, "mbcs") == 0)
2877 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002878#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002879 else if (strcmp(lower, "ascii") == 0)
2880 return PyUnicode_DecodeASCII(s, size, errors);
2881 else if (strcmp(lower, "utf-16") == 0)
2882 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2883 else if (strcmp(lower, "utf-32") == 0)
2884 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2885 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002886
2887 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002888 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002889 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002890 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002891 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002892 if (buffer == NULL)
2893 goto onError;
2894 unicode = PyCodec_Decode(buffer, encoding, errors);
2895 if (unicode == NULL)
2896 goto onError;
2897 if (!PyUnicode_Check(unicode)) {
2898 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002899 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002900 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002901 Py_DECREF(unicode);
2902 goto onError;
2903 }
2904 Py_DECREF(buffer);
Victor Stinner17efeed2011-10-04 20:05:46 +02002905#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02002906 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002907 Py_DECREF(unicode);
2908 return NULL;
2909 }
Victor Stinner17efeed2011-10-04 20:05:46 +02002910#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002911 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002912 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002913
Benjamin Peterson29060642009-01-31 22:14:21 +00002914 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002915 Py_XDECREF(buffer);
2916 return NULL;
2917}
2918
Alexander Belopolsky40018472011-02-26 01:02:56 +00002919PyObject *
2920PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002921 const char *encoding,
2922 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002923{
2924 PyObject *v;
2925
2926 if (!PyUnicode_Check(unicode)) {
2927 PyErr_BadArgument();
2928 goto onError;
2929 }
2930
2931 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002932 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002933
2934 /* Decode via the codec registry */
2935 v = PyCodec_Decode(unicode, encoding, errors);
2936 if (v == NULL)
2937 goto onError;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002938 assert(_PyUnicode_CheckConsistency(v, 1));
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002939 return v;
2940
Benjamin Peterson29060642009-01-31 22:14:21 +00002941 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002942 return NULL;
2943}
2944
Alexander Belopolsky40018472011-02-26 01:02:56 +00002945PyObject *
2946PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002947 const char *encoding,
2948 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002949{
2950 PyObject *v;
2951
2952 if (!PyUnicode_Check(unicode)) {
2953 PyErr_BadArgument();
2954 goto onError;
2955 }
2956
2957 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002958 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002959
2960 /* Decode via the codec registry */
2961 v = PyCodec_Decode(unicode, encoding, errors);
2962 if (v == NULL)
2963 goto onError;
2964 if (!PyUnicode_Check(v)) {
2965 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002966 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002967 Py_TYPE(v)->tp_name);
2968 Py_DECREF(v);
2969 goto onError;
2970 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002971 assert(_PyUnicode_CheckConsistency(v, 1));
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002972 return v;
2973
Benjamin Peterson29060642009-01-31 22:14:21 +00002974 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002975 return NULL;
2976}
2977
Alexander Belopolsky40018472011-02-26 01:02:56 +00002978PyObject *
2979PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002980 Py_ssize_t size,
2981 const char *encoding,
2982 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002983{
2984 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002985
Guido van Rossumd57fd912000-03-10 22:53:23 +00002986 unicode = PyUnicode_FromUnicode(s, size);
2987 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002988 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002989 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2990 Py_DECREF(unicode);
2991 return v;
2992}
2993
Alexander Belopolsky40018472011-02-26 01:02:56 +00002994PyObject *
2995PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002996 const char *encoding,
2997 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002998{
2999 PyObject *v;
3000
3001 if (!PyUnicode_Check(unicode)) {
3002 PyErr_BadArgument();
3003 goto onError;
3004 }
3005
3006 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003007 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003008
3009 /* Encode via the codec registry */
3010 v = PyCodec_Encode(unicode, encoding, errors);
3011 if (v == NULL)
3012 goto onError;
3013 return v;
3014
Benjamin Peterson29060642009-01-31 22:14:21 +00003015 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003016 return NULL;
3017}
3018
Victor Stinnerad158722010-10-27 00:25:46 +00003019PyObject *
3020PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003021{
Victor Stinner99b95382011-07-04 14:23:54 +02003022#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003023 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3024 PyUnicode_GET_SIZE(unicode),
3025 NULL);
3026#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003027 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003028#else
Victor Stinner793b5312011-04-27 00:24:21 +02003029 PyInterpreterState *interp = PyThreadState_GET()->interp;
3030 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3031 cannot use it to encode and decode filenames before it is loaded. Load
3032 the Python codec requires to encode at least its own filename. Use the C
3033 version of the locale codec until the codec registry is initialized and
3034 the Python codec is loaded.
3035
3036 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3037 cannot only rely on it: check also interp->fscodec_initialized for
3038 subinterpreters. */
3039 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003040 return PyUnicode_AsEncodedString(unicode,
3041 Py_FileSystemDefaultEncoding,
3042 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003043 }
3044 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003045 /* locale encoding with surrogateescape */
3046 wchar_t *wchar;
3047 char *bytes;
3048 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00003049 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003050
3051 wchar = PyUnicode_AsWideCharString(unicode, NULL);
3052 if (wchar == NULL)
3053 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003054 bytes = _Py_wchar2char(wchar, &error_pos);
3055 if (bytes == NULL) {
3056 if (error_pos != (size_t)-1) {
3057 char *errmsg = strerror(errno);
3058 PyObject *exc = NULL;
3059 if (errmsg == NULL)
3060 errmsg = "Py_wchar2char() failed";
3061 raise_encode_exception(&exc,
3062 "filesystemencoding",
3063 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
3064 error_pos, error_pos+1,
3065 errmsg);
3066 Py_XDECREF(exc);
3067 }
3068 else
3069 PyErr_NoMemory();
3070 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003071 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003072 }
3073 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003074
3075 bytes_obj = PyBytes_FromString(bytes);
3076 PyMem_Free(bytes);
3077 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00003078 }
Victor Stinnerad158722010-10-27 00:25:46 +00003079#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003080}
3081
Alexander Belopolsky40018472011-02-26 01:02:56 +00003082PyObject *
3083PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003084 const char *encoding,
3085 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003086{
3087 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003088 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003089
Guido van Rossumd57fd912000-03-10 22:53:23 +00003090 if (!PyUnicode_Check(unicode)) {
3091 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003092 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003093 }
Fred Drakee4315f52000-05-09 19:53:39 +00003094
Victor Stinner2f283c22011-03-02 01:21:46 +00003095 if (encoding == NULL) {
3096 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003097 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003098 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003099 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00003100 }
Fred Drakee4315f52000-05-09 19:53:39 +00003101
3102 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003103 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003104 if ((strcmp(lower, "utf-8") == 0) ||
3105 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003106 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003107 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003108 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003109 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003110 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003111 }
Victor Stinner37296e82010-06-10 13:36:23 +00003112 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003113 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003114 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003115 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003116#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003117 else if (strcmp(lower, "mbcs") == 0)
3118 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3119 PyUnicode_GET_SIZE(unicode),
3120 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003121#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003122 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003123 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003124 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003125
3126 /* Encode via the codec registry */
3127 v = PyCodec_Encode(unicode, encoding, errors);
3128 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003129 return NULL;
3130
3131 /* The normal path */
3132 if (PyBytes_Check(v))
3133 return v;
3134
3135 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003136 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003137 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003138 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003139
3140 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3141 "encoder %s returned bytearray instead of bytes",
3142 encoding);
3143 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003144 Py_DECREF(v);
3145 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003146 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003147
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003148 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3149 Py_DECREF(v);
3150 return b;
3151 }
3152
3153 PyErr_Format(PyExc_TypeError,
3154 "encoder did not return a bytes object (type=%.400s)",
3155 Py_TYPE(v)->tp_name);
3156 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003157 return NULL;
3158}
3159
Alexander Belopolsky40018472011-02-26 01:02:56 +00003160PyObject *
3161PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003162 const char *encoding,
3163 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003164{
3165 PyObject *v;
3166
3167 if (!PyUnicode_Check(unicode)) {
3168 PyErr_BadArgument();
3169 goto onError;
3170 }
3171
3172 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003173 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003174
3175 /* Encode via the codec registry */
3176 v = PyCodec_Encode(unicode, encoding, errors);
3177 if (v == NULL)
3178 goto onError;
3179 if (!PyUnicode_Check(v)) {
3180 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003181 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003182 Py_TYPE(v)->tp_name);
3183 Py_DECREF(v);
3184 goto onError;
3185 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003186 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003187
Benjamin Peterson29060642009-01-31 22:14:21 +00003188 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003189 return NULL;
3190}
3191
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003192PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003193PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003194 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003195 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3196}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003197
Christian Heimes5894ba72007-11-04 11:43:14 +00003198PyObject*
3199PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3200{
Victor Stinner99b95382011-07-04 14:23:54 +02003201#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003202 return PyUnicode_DecodeMBCS(s, size, NULL);
3203#elif defined(__APPLE__)
3204 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
3205#else
Victor Stinner793b5312011-04-27 00:24:21 +02003206 PyInterpreterState *interp = PyThreadState_GET()->interp;
3207 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3208 cannot use it to encode and decode filenames before it is loaded. Load
3209 the Python codec requires to encode at least its own filename. Use the C
3210 version of the locale codec until the codec registry is initialized and
3211 the Python codec is loaded.
3212
3213 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3214 cannot only rely on it: check also interp->fscodec_initialized for
3215 subinterpreters. */
3216 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003217 return PyUnicode_Decode(s, size,
3218 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003219 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003220 }
3221 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003222 /* locale encoding with surrogateescape */
3223 wchar_t *wchar;
3224 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00003225 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003226
3227 if (s[size] != '\0' || size != strlen(s)) {
3228 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3229 return NULL;
3230 }
3231
Victor Stinner168e1172010-10-16 23:16:16 +00003232 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003233 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00003234 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003235
Victor Stinner168e1172010-10-16 23:16:16 +00003236 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003237 PyMem_Free(wchar);
3238 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003239 }
Victor Stinnerad158722010-10-27 00:25:46 +00003240#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003241}
3242
Martin v. Löwis011e8422009-05-05 04:43:17 +00003243
3244int
3245PyUnicode_FSConverter(PyObject* arg, void* addr)
3246{
3247 PyObject *output = NULL;
3248 Py_ssize_t size;
3249 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003250 if (arg == NULL) {
3251 Py_DECREF(*(PyObject**)addr);
3252 return 1;
3253 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003254 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003255 output = arg;
3256 Py_INCREF(output);
3257 }
3258 else {
3259 arg = PyUnicode_FromObject(arg);
3260 if (!arg)
3261 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003262 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003263 Py_DECREF(arg);
3264 if (!output)
3265 return 0;
3266 if (!PyBytes_Check(output)) {
3267 Py_DECREF(output);
3268 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3269 return 0;
3270 }
3271 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003272 size = PyBytes_GET_SIZE(output);
3273 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003274 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003275 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003276 Py_DECREF(output);
3277 return 0;
3278 }
3279 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003280 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003281}
3282
3283
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003284int
3285PyUnicode_FSDecoder(PyObject* arg, void* addr)
3286{
3287 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003288 if (arg == NULL) {
3289 Py_DECREF(*(PyObject**)addr);
3290 return 1;
3291 }
3292 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003293 if (PyUnicode_READY(arg))
3294 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003295 output = arg;
3296 Py_INCREF(output);
3297 }
3298 else {
3299 arg = PyBytes_FromObject(arg);
3300 if (!arg)
3301 return 0;
3302 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3303 PyBytes_GET_SIZE(arg));
3304 Py_DECREF(arg);
3305 if (!output)
3306 return 0;
3307 if (!PyUnicode_Check(output)) {
3308 Py_DECREF(output);
3309 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3310 return 0;
3311 }
3312 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003313 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3314 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003315 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3316 Py_DECREF(output);
3317 return 0;
3318 }
3319 *(PyObject**)addr = output;
3320 return Py_CLEANUP_SUPPORTED;
3321}
3322
3323
Martin v. Löwis5b222132007-06-10 09:51:05 +00003324char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003325PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003326{
Christian Heimesf3863112007-11-22 07:46:41 +00003327 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003328 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
3329
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003330 if (!PyUnicode_Check(unicode)) {
3331 PyErr_BadArgument();
3332 return NULL;
3333 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003334 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003335 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003336
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003337 if (PyUnicode_UTF8(unicode) == NULL) {
3338 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003339 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3340 if (bytes == NULL)
3341 return NULL;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003342 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3343 if (_PyUnicode_UTF8(u) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003344 Py_DECREF(bytes);
3345 return NULL;
3346 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003347 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
3348 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003349 Py_DECREF(bytes);
3350 }
3351
3352 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003353 *psize = PyUnicode_UTF8_LENGTH(unicode);
3354 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003355}
3356
3357char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003358PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003359{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003360 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3361}
3362
3363#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02003364static int unicode_as_unicode_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003365#endif
3366
3367
3368Py_UNICODE *
3369PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3370{
3371 PyUnicodeObject *u;
3372 const unsigned char *one_byte;
3373#if SIZEOF_WCHAR_T == 4
3374 const Py_UCS2 *two_bytes;
3375#else
3376 const Py_UCS4 *four_bytes;
3377 const Py_UCS4 *ucs4_end;
3378 Py_ssize_t num_surrogates;
3379#endif
3380 wchar_t *w;
3381 wchar_t *wchar_end;
3382
3383 if (!PyUnicode_Check(unicode)) {
3384 PyErr_BadArgument();
3385 return NULL;
3386 }
3387 u = (PyUnicodeObject*)unicode;
3388 if (_PyUnicode_WSTR(u) == NULL) {
3389 /* Non-ASCII compact unicode object */
3390 assert(_PyUnicode_KIND(u) != 0);
3391 assert(PyUnicode_IS_READY(u));
3392
3393#ifdef Py_DEBUG
3394 ++unicode_as_unicode_calls;
3395#endif
3396
3397 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
3398#if SIZEOF_WCHAR_T == 2
3399 four_bytes = PyUnicode_4BYTE_DATA(u);
3400 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
3401 num_surrogates = 0;
3402
3403 for (; four_bytes < ucs4_end; ++four_bytes) {
3404 if (*four_bytes > 0xFFFF)
3405 ++num_surrogates;
3406 }
3407
3408 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
3409 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
3410 if (!_PyUnicode_WSTR(u)) {
3411 PyErr_NoMemory();
3412 return NULL;
3413 }
3414 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
3415
3416 w = _PyUnicode_WSTR(u);
3417 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
3418 four_bytes = PyUnicode_4BYTE_DATA(u);
3419 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3420 if (*four_bytes > 0xFFFF) {
3421 /* encode surrogate pair in this case */
3422 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
3423 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
3424 }
3425 else
3426 *w = *four_bytes;
3427
3428 if (w > wchar_end) {
3429 assert(0 && "Miscalculated string end");
3430 }
3431 }
3432 *w = 0;
3433#else
3434 /* sizeof(wchar_t) == 4 */
3435 Py_FatalError("Impossible unicode object state, wstr and str "
3436 "should share memory already.");
3437 return NULL;
3438#endif
3439 }
3440 else {
3441 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3442 (_PyUnicode_LENGTH(u) + 1));
3443 if (!_PyUnicode_WSTR(u)) {
3444 PyErr_NoMemory();
3445 return NULL;
3446 }
3447 if (!PyUnicode_IS_COMPACT_ASCII(u))
3448 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
3449 w = _PyUnicode_WSTR(u);
3450 wchar_end = w + _PyUnicode_LENGTH(u);
3451
3452 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
3453 one_byte = PyUnicode_1BYTE_DATA(u);
3454 for (; w < wchar_end; ++one_byte, ++w)
3455 *w = *one_byte;
3456 /* null-terminate the wstr */
3457 *w = 0;
3458 }
3459 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
3460#if SIZEOF_WCHAR_T == 4
3461 two_bytes = PyUnicode_2BYTE_DATA(u);
3462 for (; w < wchar_end; ++two_bytes, ++w)
3463 *w = *two_bytes;
3464 /* null-terminate the wstr */
3465 *w = 0;
3466#else
3467 /* sizeof(wchar_t) == 2 */
3468 PyObject_FREE(_PyUnicode_WSTR(u));
3469 _PyUnicode_WSTR(u) = NULL;
3470 Py_FatalError("Impossible unicode object state, wstr "
3471 "and str should share memory already.");
3472 return NULL;
3473#endif
3474 }
3475 else {
3476 assert(0 && "This should never happen.");
3477 }
3478 }
3479 }
3480 if (size != NULL)
3481 *size = PyUnicode_WSTR_LENGTH(u);
3482 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003483}
3484
Alexander Belopolsky40018472011-02-26 01:02:56 +00003485Py_UNICODE *
3486PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003487{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003488 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003489}
3490
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003491
Alexander Belopolsky40018472011-02-26 01:02:56 +00003492Py_ssize_t
3493PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003494{
3495 if (!PyUnicode_Check(unicode)) {
3496 PyErr_BadArgument();
3497 goto onError;
3498 }
3499 return PyUnicode_GET_SIZE(unicode);
3500
Benjamin Peterson29060642009-01-31 22:14:21 +00003501 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003502 return -1;
3503}
3504
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003505Py_ssize_t
3506PyUnicode_GetLength(PyObject *unicode)
3507{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003508 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003509 PyErr_BadArgument();
3510 return -1;
3511 }
3512
3513 return PyUnicode_GET_LENGTH(unicode);
3514}
3515
3516Py_UCS4
3517PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3518{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003519 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3520 PyErr_BadArgument();
3521 return (Py_UCS4)-1;
3522 }
3523 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3524 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003525 return (Py_UCS4)-1;
3526 }
3527 return PyUnicode_READ_CHAR(unicode, index);
3528}
3529
3530int
3531PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3532{
3533 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003534 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003535 return -1;
3536 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003537 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3538 PyErr_SetString(PyExc_IndexError, "string index out of range");
3539 return -1;
3540 }
3541 if (_PyUnicode_Dirty(unicode))
3542 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003543 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3544 index, ch);
3545 return 0;
3546}
3547
/* Return the name of the default encoding, which is always "utf-8".  The
   returned string has static storage and must not be modified or freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3553
Victor Stinner554f3f02010-06-16 23:33:54 +00003554/* create or adjust a UnicodeDecodeError */
3555static void
3556make_decode_exception(PyObject **exceptionObject,
3557 const char *encoding,
3558 const char *input, Py_ssize_t length,
3559 Py_ssize_t startpos, Py_ssize_t endpos,
3560 const char *reason)
3561{
3562 if (*exceptionObject == NULL) {
3563 *exceptionObject = PyUnicodeDecodeError_Create(
3564 encoding, input, length, startpos, endpos, reason);
3565 }
3566 else {
3567 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3568 goto onError;
3569 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3570 goto onError;
3571 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3572 goto onError;
3573 }
3574 return;
3575
3576onError:
3577 Py_DECREF(*exceptionObject);
3578 *exceptionObject = NULL;
3579}
3580
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003581/* error handling callback helper:
3582 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003583 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003584 and adjust various state variables.
3585 return 0 on success, -1 on error
3586*/
3587
Alexander Belopolsky40018472011-02-26 01:02:56 +00003588static int
3589unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003590 const char *encoding, const char *reason,
3591 const char **input, const char **inend, Py_ssize_t *startinpos,
3592 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3593 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003594{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003595 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003596
3597 PyObject *restuple = NULL;
3598 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003599 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003600 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003601 Py_ssize_t requiredsize;
3602 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003603 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003604 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003605 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003606 int res = -1;
3607
3608 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003609 *errorHandler = PyCodec_LookupError(errors);
3610 if (*errorHandler == NULL)
3611 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003612 }
3613
Victor Stinner554f3f02010-06-16 23:33:54 +00003614 make_decode_exception(exceptionObject,
3615 encoding,
3616 *input, *inend - *input,
3617 *startinpos, *endinpos,
3618 reason);
3619 if (*exceptionObject == NULL)
3620 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003621
3622 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3623 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003624 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003625 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003626 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003627 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003628 }
3629 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003630 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003631
3632 /* Copy back the bytes variables, which might have been modified by the
3633 callback */
3634 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3635 if (!inputobj)
3636 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003637 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003638 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003639 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003640 *input = PyBytes_AS_STRING(inputobj);
3641 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003642 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003643 /* we can DECREF safely, as the exception has another reference,
3644 so the object won't go away. */
3645 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003646
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003647 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003648 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003649 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003650 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3651 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003652 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003653
3654 /* need more space? (at least enough for what we
3655 have+the replacement+the rest of the string (starting
3656 at the new input position), so we won't have to check space
3657 when there are no errors in the rest of the string) */
3658 repptr = PyUnicode_AS_UNICODE(repunicode);
3659 repsize = PyUnicode_GET_SIZE(repunicode);
3660 requiredsize = *outpos + repsize + insize-newpos;
3661 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003662 if (requiredsize<2*outsize)
3663 requiredsize = 2*outsize;
Victor Stinnerfe226c02011-10-03 03:52:20 +02003664 if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003665 goto onError;
3666 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003667 }
3668 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003669 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003670 Py_UNICODE_COPY(*outptr, repptr, repsize);
3671 *outptr += repsize;
3672 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003673
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003674 /* we made it! */
3675 res = 0;
3676
Benjamin Peterson29060642009-01-31 22:14:21 +00003677 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003678 Py_XDECREF(restuple);
3679 return res;
3680}
3681
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003682/* --- UTF-7 Codec -------------------------------------------------------- */
3683
Antoine Pitrou244651a2009-05-04 18:56:13 +00003684/* See RFC2152 for details. We encode conservatively and decode liberally. */
3685
3686/* Three simple macros defining base-64. */
3687
/* IS_BASE64(c): true when c is one of the 64 characters of the base-64
   alphabet (A-Z, a-z, 0-9, '+' and '/').
   The argument is evaluated more than once, so it must not have side
   effects. */

#define IS_BASE64(c)                        \
    ((c) == '+' || (c) == '/' ||            \
     ('0' <= (c) && (c) <= '9') ||          \
     ('a' <= (c) && (c) <= 'z') ||          \
     ('A' <= (c) && (c) <= 'Z'))
3695
/* FROM_BASE64(c): the 6-bit value (0..63) that the base-64 character c
   stands for.  The caller is expected to have checked IS_BASE64(c)
   first; any character not matched by an explicit test below yields 63,
   the value of '/'.  The argument is evaluated more than once. */

#define FROM_BASE64(c)                                      \
    (((c) == '+') ? 62 :                                    \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :          \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :          \
     ('A' <= (c) && (c) <= 'Z') ? (c) - 'A' : 63)
3703
/* TO_BASE64(n): the base-64 character that encodes the low 6 bits of n;
   any higher bits of n are masked off before the lookup. */

#define TO_BASE64(n)                        \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"           \
     "abcdefghijklmnopqrstuvwxyz"           \
     "0123456789+/"[(n) & 0x3f])
3708
/* DECODE_DIRECT(c): true when a byte met outside a shift sequence of a
 * UTF-7 string decodes as itself.  Decoding is permissive: every ASCII
 * byte qualifies except '+', which opens a base-64 section.  The
 * argument is evaluated more than once. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) < 128)
3716
/* Classification of the 128 ASCII characters for the UTF-7 encoder,
 * which treats a character differently depending on the RFC 2152 set
 * it belongs to.  Each table entry is one of:
 *   0 : "Set D"       alphanumerics and '(),-./:?
 *   1 : "Set O"       !"#$%&*;<=>@[]^_`{|}
 *   2 : "whitespace"  ht nl cr sp
 *   3 : special (must be base-64 encoded): everything else, i.e.
 *       + \ ~ and the non-printing codes 0-8, 11-12, 14-31 and 127
 */

static char utf7_category[128] = {
    /* 0x00-0x0F: nul soh stx etx eot enq ack bel bs ht nl vt np cr so si */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    /* 0x10-0x1F: dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0x20-0x2F: sp ! " # $ % & ' ( ) * + , - . / */
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0,
    /* 0x30-0x3F: 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    /* 0x40-0x4F: @ A B C D E F G H I J K L M N O */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F: P Q R S T U V W X Y Z [ \ ] ^ _ */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    /* 0x60-0x6F: ` a b c d e f g h i j k l m n o */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F: p q r s t u v w x y z { | } ~ del */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3,
};
3750
/* ENCODE_DIRECT(c, directO, directWS): true when the character c should
 * be encoded as itself rather than inside a base-64 shift sequence.
 *
 *   c        - the character; only values 1..127 can ever be direct
 *              (0 and anything >= 128 are rejected before the table
 *              lookup, so utf7_category is never indexed out of range)
 *   directO  - non-zero to emit "Set O" characters (category 1) directly
 *   directWS - non-zero to emit whitespace (category 2) directly
 *
 * "Set D" characters (category 0) are always direct.  RFC2152 makes it
 * clear that the treatment of Set O and whitespace varies between
 * applications, hence the two flags.
 *
 * Every argument is parenthesized in the expansion so that compound
 * flag expressions (e.g. "a || b") keep their meaning next to the &&.
 * c is evaluated more than once and must not have side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003762
Alexander Belopolsky40018472011-02-26 01:02:56 +00003763PyObject *
3764PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003765 Py_ssize_t size,
3766 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003767{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003768 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3769}
3770
Antoine Pitrou244651a2009-05-04 18:56:13 +00003771/* The decoder. The only state we preserve is our read position,
3772 * i.e. how many characters we have consumed. So if we end in the
3773 * middle of a shift sequence we have to back off the read position
3774 * and the output to the beginning of the sequence, otherwise we lose
3775 * all the shift state (seen bits, number of bits seen, high
3776 * surrogate). */
3777
Alexander Belopolsky40018472011-02-26 01:02:56 +00003778PyObject *
3779PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003780 Py_ssize_t size,
3781 const char *errors,
3782 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003783{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003784 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003785 Py_ssize_t startinpos;
3786 Py_ssize_t endinpos;
3787 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003788 const char *e;
3789 PyUnicodeObject *unicode;
3790 Py_UNICODE *p;
3791 const char *errmsg = "";
3792 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003793 Py_UNICODE *shiftOutStart;
3794 unsigned int base64bits = 0;
3795 unsigned long base64buffer = 0;
3796 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003797 PyObject *errorHandler = NULL;
3798 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003799
3800 unicode = _PyUnicode_New(size);
3801 if (!unicode)
3802 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003803 if (size == 0) {
3804 if (consumed)
3805 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003806 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003807 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003808
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003809 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003810 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003811 e = s + size;
3812
3813 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003814 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003815 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003816 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003817
Antoine Pitrou244651a2009-05-04 18:56:13 +00003818 if (inShift) { /* in a base-64 section */
3819 if (IS_BASE64(ch)) { /* consume a base-64 character */
3820 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3821 base64bits += 6;
3822 s++;
3823 if (base64bits >= 16) {
3824 /* we have enough bits for a UTF-16 value */
3825 Py_UNICODE outCh = (Py_UNICODE)
3826 (base64buffer >> (base64bits-16));
3827 base64bits -= 16;
3828 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3829 if (surrogate) {
3830 /* expecting a second surrogate */
3831 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3832#ifdef Py_UNICODE_WIDE
3833 *p++ = (((surrogate & 0x3FF)<<10)
3834 | (outCh & 0x3FF)) + 0x10000;
3835#else
3836 *p++ = surrogate;
3837 *p++ = outCh;
3838#endif
3839 surrogate = 0;
3840 }
3841 else {
3842 surrogate = 0;
3843 errmsg = "second surrogate missing";
3844 goto utf7Error;
3845 }
3846 }
3847 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3848 /* first surrogate */
3849 surrogate = outCh;
3850 }
3851 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3852 errmsg = "unexpected second surrogate";
3853 goto utf7Error;
3854 }
3855 else {
3856 *p++ = outCh;
3857 }
3858 }
3859 }
3860 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003861 inShift = 0;
3862 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003863 if (surrogate) {
3864 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003865 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003866 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003867 if (base64bits > 0) { /* left-over bits */
3868 if (base64bits >= 6) {
3869 /* We've seen at least one base-64 character */
3870 errmsg = "partial character in shift sequence";
3871 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003872 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003873 else {
3874 /* Some bits remain; they should be zero */
3875 if (base64buffer != 0) {
3876 errmsg = "non-zero padding bits in shift sequence";
3877 goto utf7Error;
3878 }
3879 }
3880 }
3881 if (ch != '-') {
3882 /* '-' is absorbed; other terminating
3883 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003884 *p++ = ch;
3885 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003886 }
3887 }
3888 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003889 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003890 s++; /* consume '+' */
3891 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003892 s++;
3893 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003894 }
3895 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003896 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003897 shiftOutStart = p;
3898 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003899 }
3900 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003901 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003902 *p++ = ch;
3903 s++;
3904 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003905 else {
3906 startinpos = s-starts;
3907 s++;
3908 errmsg = "unexpected special character";
3909 goto utf7Error;
3910 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003911 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003912utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003913 outpos = p-PyUnicode_AS_UNICODE(unicode);
3914 endinpos = s-starts;
3915 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003916 errors, &errorHandler,
3917 "utf7", errmsg,
3918 &starts, &e, &startinpos, &endinpos, &exc, &s,
3919 &unicode, &outpos, &p))
3920 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003921 }
3922
Antoine Pitrou244651a2009-05-04 18:56:13 +00003923 /* end of string */
3924
3925 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3926 /* if we're in an inconsistent state, that's an error */
3927 if (surrogate ||
3928 (base64bits >= 6) ||
3929 (base64bits > 0 && base64buffer != 0)) {
3930 outpos = p-PyUnicode_AS_UNICODE(unicode);
3931 endinpos = size;
3932 if (unicode_decode_call_errorhandler(
3933 errors, &errorHandler,
3934 "utf7", "unterminated shift sequence",
3935 &starts, &e, &startinpos, &endinpos, &exc, &s,
3936 &unicode, &outpos, &p))
3937 goto onError;
3938 if (s < e)
3939 goto restart;
3940 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003941 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003942
3943 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003944 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003945 if (inShift) {
3946 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003947 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003948 }
3949 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003950 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003951 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003952 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003953
Victor Stinnerfe226c02011-10-03 03:52:20 +02003954 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003955 goto onError;
3956
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003957 Py_XDECREF(errorHandler);
3958 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02003959#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02003960 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003961 Py_DECREF(unicode);
3962 return NULL;
3963 }
Victor Stinner17efeed2011-10-04 20:05:46 +02003964#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02003965 assert(_PyUnicode_CheckConsistency(unicode, 1));
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003966 return (PyObject *)unicode;
3967
Benjamin Peterson29060642009-01-31 22:14:21 +00003968 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003969 Py_XDECREF(errorHandler);
3970 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003971 Py_DECREF(unicode);
3972 return NULL;
3973}
3974
3975
Alexander Belopolsky40018472011-02-26 01:02:56 +00003976PyObject *
3977PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003978 Py_ssize_t size,
3979 int base64SetO,
3980 int base64WhiteSpace,
3981 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003982{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003983 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003984 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003985 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003986 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003987 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003988 unsigned int base64bits = 0;
3989 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003990 char * out;
3991 char * start;
3992
3993 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003994 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003995
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003996 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003997 return PyErr_NoMemory();
3998
Antoine Pitrou244651a2009-05-04 18:56:13 +00003999 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004000 if (v == NULL)
4001 return NULL;
4002
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004003 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004004 for (;i < size; ++i) {
4005 Py_UNICODE ch = s[i];
4006
Antoine Pitrou244651a2009-05-04 18:56:13 +00004007 if (inShift) {
4008 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4009 /* shifting out */
4010 if (base64bits) { /* output remaining bits */
4011 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4012 base64buffer = 0;
4013 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004014 }
4015 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004016 /* Characters not in the BASE64 set implicitly unshift the sequence
4017 so no '-' is required, except if the character is itself a '-' */
4018 if (IS_BASE64(ch) || ch == '-') {
4019 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004020 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004021 *out++ = (char) ch;
4022 }
4023 else {
4024 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004025 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004026 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004027 else { /* not in a shift sequence */
4028 if (ch == '+') {
4029 *out++ = '+';
4030 *out++ = '-';
4031 }
4032 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4033 *out++ = (char) ch;
4034 }
4035 else {
4036 *out++ = '+';
4037 inShift = 1;
4038 goto encode_char;
4039 }
4040 }
4041 continue;
4042encode_char:
4043#ifdef Py_UNICODE_WIDE
4044 if (ch >= 0x10000) {
4045 /* code first surrogate */
4046 base64bits += 16;
4047 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4048 while (base64bits >= 6) {
4049 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4050 base64bits -= 6;
4051 }
4052 /* prepare second surrogate */
4053 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
4054 }
4055#endif
4056 base64bits += 16;
4057 base64buffer = (base64buffer << 16) | ch;
4058 while (base64bits >= 6) {
4059 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4060 base64bits -= 6;
4061 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004062 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004063 if (base64bits)
4064 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4065 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004066 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004067 if (_PyBytes_Resize(&v, out - start) < 0)
4068 return NULL;
4069 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004070}
4071
Antoine Pitrou244651a2009-05-04 18:56:13 +00004072#undef IS_BASE64
4073#undef FROM_BASE64
4074#undef TO_BASE64
4075#undef DECODE_DIRECT
4076#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004077
Guido van Rossumd57fd912000-03-10 22:53:23 +00004078/* --- UTF-8 Codec -------------------------------------------------------- */
4079
/* Map a UTF-8 encoded prefix (first) byte to the total length in bytes
   of the sequence it starts.  Zero means illegal prefix.  See RFC 3629
   for details. */
static char utf8_code_length[256] = {
    /* 0x00-0x7F: single-byte (ASCII) sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xBF: never valid as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xC1: illegal; 0xC2-0xCF: two bytes */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xD0-0xDF: two bytes */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF: three bytes */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xF4: four bytes; 0xF5-0xFF: illegal */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
4101
Alexander Belopolsky40018472011-02-26 01:02:56 +00004102PyObject *
4103PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004104 Py_ssize_t size,
4105 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004106{
Walter Dörwald69652032004-09-07 20:24:22 +00004107 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4108}
4109
Antoine Pitrouab868312009-01-10 15:40:25 +00004110/* Mask to check or force alignment of a pointer to C 'long' boundaries */
4111#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
4112
4113/* Mask to quickly check whether a C 'long' contains a
4114 non-ASCII, UTF8-encoded char. */
4115#if (SIZEOF_LONG == 8)
4116# define ASCII_CHAR_MASK 0x8080808080808080L
4117#elif (SIZEOF_LONG == 4)
4118# define ASCII_CHAR_MASK 0x80808080L
4119#else
4120# error C 'long' size should be either 4 or 8!
4121#endif
4122
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004123/* Scans a UTF-8 string and returns the maximum character to be expected,
4124 the size of the decoded unicode string and if any major errors were
4125 encountered.
4126
4127 This function does check basic UTF-8 sanity, it does however NOT CHECK
4128 if the string contains surrogates, and if all continuation bytes are
4129 within the correct ranges, these checks are performed in
4130 PyUnicode_DecodeUTF8Stateful.
4131
4132 If it sets has_errors to 1, it means the value of unicode_size and max_char
4133 will be bogus and you should not rely on useful information in them.
4134 */
4135static Py_UCS4
4136utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
4137 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
4138 int *has_errors)
4139{
4140 Py_ssize_t n;
4141 Py_ssize_t char_count = 0;
4142 Py_UCS4 max_char = 127, new_max;
4143 Py_UCS4 upper_bound;
4144 const unsigned char *p = (const unsigned char *)s;
4145 const unsigned char *end = p + string_size;
4146 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
4147 int err = 0;
4148
4149 for (; p < end && !err; ++p, ++char_count) {
4150 /* Only check value if it's not a ASCII char... */
4151 if (*p < 0x80) {
4152 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4153 an explanation. */
4154 if (!((size_t) p & LONG_PTR_MASK)) {
4155 /* Help register allocation */
4156 register const unsigned char *_p = p;
4157 while (_p < aligned_end) {
4158 unsigned long value = *(unsigned long *) _p;
4159 if (value & ASCII_CHAR_MASK)
4160 break;
4161 _p += SIZEOF_LONG;
4162 char_count += SIZEOF_LONG;
4163 }
4164 p = _p;
4165 if (p == end)
4166 break;
4167 }
4168 }
4169 if (*p >= 0x80) {
4170 n = utf8_code_length[*p];
4171 new_max = max_char;
4172 switch (n) {
4173 /* invalid start byte */
4174 case 0:
4175 err = 1;
4176 break;
4177 case 2:
4178 /* Code points between 0x00FF and 0x07FF inclusive.
4179 Approximate the upper bound of the code point,
4180 if this flips over 255 we can be sure it will be more
4181 than 255 and the string will need 2 bytes per code coint,
4182 if it stays under or equal to 255, we can be sure 1 byte
4183 is enough.
4184 ((*p & 0b00011111) << 6) | 0b00111111 */
4185 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
4186 if (max_char < upper_bound)
4187 new_max = upper_bound;
4188 /* Ensure we track at least that we left ASCII space. */
4189 if (new_max < 128)
4190 new_max = 128;
4191 break;
4192 case 3:
4193 /* Between 0x0FFF and 0xFFFF inclusive, so values are
4194 always > 255 and <= 65535 and will always need 2 bytes. */
4195 if (max_char < 65535)
4196 new_max = 65535;
4197 break;
4198 case 4:
4199 /* Code point will be above 0xFFFF for sure in this case. */
4200 new_max = 65537;
4201 break;
4202 /* Internal error, this should be caught by the first if */
4203 case 1:
4204 default:
4205 assert(0 && "Impossible case in utf8_max_char_and_size");
4206 err = 1;
4207 }
4208 /* Instead of number of overall bytes for this code point,
Georg Brandl7597add2011-10-05 16:36:47 +02004209 n contains the number of following bytes: */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004210 --n;
4211 /* Check if the follow up chars are all valid continuation bytes */
4212 if (n >= 1) {
4213 const unsigned char *cont;
4214 if ((p + n) >= end) {
4215 if (consumed == 0)
4216 /* incomplete data, non-incremental decoding */
4217 err = 1;
4218 break;
4219 }
4220 for (cont = p + 1; cont < (p + n); ++cont) {
4221 if ((*cont & 0xc0) != 0x80) {
4222 err = 1;
4223 break;
4224 }
4225 }
4226 p += n;
4227 }
4228 else
4229 err = 1;
4230 max_char = new_max;
4231 }
4232 }
4233
4234 if (unicode_size)
4235 *unicode_size = char_count;
4236 if (has_errors)
4237 *has_errors = err;
4238 return max_char;
4239}
4240
/* Store value at position index of the character buffer buf, whose
   element type is selected by kind.  Similar to PyUnicode_WRITE, but it
   also accepts PyUnicode_WCHAR_KIND and then writes a Py_UNICODE, i.e.
   it can fill the wstr field of the legacy unicode representation.
   Any kind other than WCHAR, 1BYTE or 2BYTE is written as Py_UCS4.
   kind is evaluated exactly once; buf, index and value are evaluated
   once each, in the single branch that is taken. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value)                 \
    do {                                                                \
        const int target_kind_ = (kind);                                \
        if (target_kind_ == PyUnicode_WCHAR_KIND) {                     \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);       \
        }                                                               \
        else if (target_kind_ == PyUnicode_1BYTE_KIND) {                \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
        }                                                               \
        else if (target_kind_ == PyUnicode_2BYTE_KIND) {                \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value);             \
        }                                                               \
        else {                                                          \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value);             \
        }                                                               \
    } while (0)
4255
Alexander Belopolsky40018472011-02-26 01:02:56 +00004256PyObject *
4257PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004258 Py_ssize_t size,
4259 const char *errors,
4260 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004261{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004262 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004263 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004264 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004265 Py_ssize_t startinpos;
4266 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00004267 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004268 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004269 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004270 PyObject *errorHandler = NULL;
4271 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004272 Py_UCS4 maxchar = 0;
4273 Py_ssize_t unicode_size;
4274 Py_ssize_t i;
4275 int kind;
4276 void *data;
4277 int has_errors;
4278 Py_UNICODE *error_outptr;
4279#if SIZEOF_WCHAR_T == 2
4280 Py_ssize_t wchar_offset = 0;
4281#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004282
Walter Dörwald69652032004-09-07 20:24:22 +00004283 if (size == 0) {
4284 if (consumed)
4285 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004286 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00004287 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004288 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
4289 consumed, &has_errors);
4290 if (has_errors) {
4291 unicode = _PyUnicode_New(size);
4292 if (!unicode)
4293 return NULL;
4294 kind = PyUnicode_WCHAR_KIND;
4295 data = PyUnicode_AS_UNICODE(unicode);
4296 assert(data != NULL);
4297 }
4298 else {
4299 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
4300 if (!unicode)
4301 return NULL;
4302 /* When the string is ASCII only, just use memcpy and return.
4303 unicode_size may be != size if there is an incomplete UTF-8
4304 sequence at the end of the ASCII block. */
4305 if (maxchar < 128 && size == unicode_size) {
4306 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
4307 return (PyObject *)unicode;
4308 }
4309 kind = PyUnicode_KIND(unicode);
4310 data = PyUnicode_DATA(unicode);
4311 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004312 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004313 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004314 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00004315 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004316
4317 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004318 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004319
4320 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004321 /* Fast path for runs of ASCII characters. Given that common UTF-8
4322 input will consist of an overwhelming majority of ASCII
4323 characters, we try to optimize for this case by checking
4324 as many characters as a C 'long' can contain.
4325 First, check if we can do an aligned read, as most CPUs have
4326 a penalty for unaligned reads.
4327 */
4328 if (!((size_t) s & LONG_PTR_MASK)) {
4329 /* Help register allocation */
4330 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004331 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004332 while (_s < aligned_end) {
4333 /* Read a whole long at a time (either 4 or 8 bytes),
4334 and do a fast unrolled copy if it only contains ASCII
4335 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004336 unsigned long value = *(unsigned long *) _s;
4337 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004338 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004339 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
4340 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
4341 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
4342 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004343#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004344 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
4345 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
4346 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
4347 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004348#endif
4349 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004350 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004351 }
4352 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004353 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004354 if (s == e)
4355 break;
4356 ch = (unsigned char)*s;
4357 }
4358 }
4359
4360 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004361 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004362 s++;
4363 continue;
4364 }
4365
4366 n = utf8_code_length[ch];
4367
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004368 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004369 if (consumed)
4370 break;
4371 else {
4372 errmsg = "unexpected end of data";
4373 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004374 endinpos = startinpos+1;
4375 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4376 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004377 goto utf8Error;
4378 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004379 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004380
4381 switch (n) {
4382
4383 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004384 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004385 startinpos = s-starts;
4386 endinpos = startinpos+1;
4387 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004388
4389 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004390 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004391 startinpos = s-starts;
4392 endinpos = startinpos+1;
4393 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004394
4395 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004396 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004397 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004398 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004399 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004400 goto utf8Error;
4401 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004402 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004403 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004404 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004405 break;
4406
4407 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004408 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4409 will result in surrogates in range d800-dfff. Surrogates are
4410 not valid UTF-8 so they are rejected.
4411 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4412 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004413 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004414 (s[2] & 0xc0) != 0x80 ||
4415 ((unsigned char)s[0] == 0xE0 &&
4416 (unsigned char)s[1] < 0xA0) ||
4417 ((unsigned char)s[0] == 0xED &&
4418 (unsigned char)s[1] > 0x9F)) {
4419 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004420 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004421 endinpos = startinpos + 1;
4422
4423 /* if s[1] first two bits are 1 and 0, then the invalid
4424 continuation byte is s[2], so increment endinpos by 1,
4425 if not, s[1] is invalid and endinpos doesn't need to
4426 be incremented. */
4427 if ((s[1] & 0xC0) == 0x80)
4428 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004429 goto utf8Error;
4430 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004431 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004432 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004433 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004434 break;
4435
4436 case 4:
4437 if ((s[1] & 0xc0) != 0x80 ||
4438 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004439 (s[3] & 0xc0) != 0x80 ||
4440 ((unsigned char)s[0] == 0xF0 &&
4441 (unsigned char)s[1] < 0x90) ||
4442 ((unsigned char)s[0] == 0xF4 &&
4443 (unsigned char)s[1] > 0x8F)) {
4444 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004445 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004446 endinpos = startinpos + 1;
4447 if ((s[1] & 0xC0) == 0x80) {
4448 endinpos++;
4449 if ((s[2] & 0xC0) == 0x80)
4450 endinpos++;
4451 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004452 goto utf8Error;
4453 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004454 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004455 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4456 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4457
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004458 /* If the string is flexible or we have native UCS-4, write
4459 directly.. */
4460 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
4461 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00004462
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004463 else {
4464 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00004465
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004466 /* translate from 10000..10FFFF to 0..FFFF */
4467 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00004468
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004469 /* high surrogate = top 10 bits added to D800 */
4470 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4471 (Py_UNICODE)(0xD800 + (ch >> 10)));
4472
4473 /* low surrogate = bottom 10 bits added to DC00 */
4474 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4475 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
4476 }
4477#if SIZEOF_WCHAR_T == 2
4478 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004479#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004480 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004481 }
4482 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004483 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004484
Benjamin Peterson29060642009-01-31 22:14:21 +00004485 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004486 /* If this is not yet a resizable string, make it one.. */
4487 if (kind != PyUnicode_WCHAR_KIND) {
4488 const Py_UNICODE *u;
4489 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
4490 if (!new_unicode)
4491 goto onError;
4492 u = PyUnicode_AsUnicode((PyObject *)unicode);
4493 if (!u)
4494 goto onError;
4495#if SIZEOF_WCHAR_T == 2
4496 i += wchar_offset;
4497#endif
4498 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
4499 Py_DECREF(unicode);
4500 unicode = new_unicode;
4501 kind = 0;
4502 data = PyUnicode_AS_UNICODE(new_unicode);
4503 assert(data != NULL);
4504 }
4505 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00004506 if (unicode_decode_call_errorhandler(
4507 errors, &errorHandler,
4508 "utf8", errmsg,
4509 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004510 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00004511 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004512 /* Update data because unicode_decode_call_errorhandler might have
4513 re-created or resized the unicode object. */
4514 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004515 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004516 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004517 /* Ensure the unicode_size calculation above was correct: */
4518 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
4519
Walter Dörwald69652032004-09-07 20:24:22 +00004520 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004521 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004522
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004523 /* Adjust length and ready string when it contained errors and
4524 is of the old resizable kind. */
4525 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004526 if (PyUnicode_Resize((PyObject**)&unicode, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004527 goto onError;
4528 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004529
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004530 Py_XDECREF(errorHandler);
4531 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02004532#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004533 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004534 Py_DECREF(unicode);
4535 return NULL;
4536 }
Victor Stinner17efeed2011-10-04 20:05:46 +02004537#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004538 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004539 return (PyObject *)unicode;
4540
Benjamin Peterson29060642009-01-31 22:14:21 +00004541 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004542 Py_XDECREF(errorHandler);
4543 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004544 Py_DECREF(unicode);
4545 return NULL;
4546}
4547
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004548#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00004549
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004550#ifdef __APPLE__
4551
4552/* Simplified UTF-8 decoder using surrogateescape error handler,
4553 used to decode the command line arguments on Mac OS X. */
4554
4555wchar_t*
4556_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4557{
4558 int n;
4559 const char *e;
4560 wchar_t *unicode, *p;
4561
4562 /* Note: size will always be longer than the resulting Unicode
4563 character count */
4564 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4565 PyErr_NoMemory();
4566 return NULL;
4567 }
4568 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4569 if (!unicode)
4570 return NULL;
4571
4572 /* Unpack UTF-8 encoded data */
4573 p = unicode;
4574 e = s + size;
4575 while (s < e) {
4576 Py_UCS4 ch = (unsigned char)*s;
4577
4578 if (ch < 0x80) {
4579 *p++ = (wchar_t)ch;
4580 s++;
4581 continue;
4582 }
4583
4584 n = utf8_code_length[ch];
4585 if (s + n > e) {
4586 goto surrogateescape;
4587 }
4588
4589 switch (n) {
4590 case 0:
4591 case 1:
4592 goto surrogateescape;
4593
4594 case 2:
4595 if ((s[1] & 0xc0) != 0x80)
4596 goto surrogateescape;
4597 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4598 assert ((ch > 0x007F) && (ch <= 0x07FF));
4599 *p++ = (wchar_t)ch;
4600 break;
4601
4602 case 3:
4603 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4604 will result in surrogates in range d800-dfff. Surrogates are
4605 not valid UTF-8 so they are rejected.
4606 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4607 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4608 if ((s[1] & 0xc0) != 0x80 ||
4609 (s[2] & 0xc0) != 0x80 ||
4610 ((unsigned char)s[0] == 0xE0 &&
4611 (unsigned char)s[1] < 0xA0) ||
4612 ((unsigned char)s[0] == 0xED &&
4613 (unsigned char)s[1] > 0x9F)) {
4614
4615 goto surrogateescape;
4616 }
4617 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4618 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004619 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004620 break;
4621
4622 case 4:
4623 if ((s[1] & 0xc0) != 0x80 ||
4624 (s[2] & 0xc0) != 0x80 ||
4625 (s[3] & 0xc0) != 0x80 ||
4626 ((unsigned char)s[0] == 0xF0 &&
4627 (unsigned char)s[1] < 0x90) ||
4628 ((unsigned char)s[0] == 0xF4 &&
4629 (unsigned char)s[1] > 0x8F)) {
4630 goto surrogateescape;
4631 }
4632 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4633 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4634 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4635
4636#if SIZEOF_WCHAR_T == 4
4637 *p++ = (wchar_t)ch;
4638#else
4639 /* compute and append the two surrogates: */
4640
4641 /* translate from 10000..10FFFF to 0..FFFF */
4642 ch -= 0x10000;
4643
4644 /* high surrogate = top 10 bits added to D800 */
4645 *p++ = (wchar_t)(0xD800 + (ch >> 10));
4646
4647 /* low surrogate = bottom 10 bits added to DC00 */
4648 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
4649#endif
4650 break;
4651 }
4652 s += n;
4653 continue;
4654
4655 surrogateescape:
4656 *p++ = 0xDC00 + ch;
4657 s++;
4658 }
4659 *p = L'\0';
4660 return unicode;
4661}
4662
4663#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004664
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004665/* Primary internal function which creates utf8 encoded bytes objects.
4666
4667 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004668 and allocate exactly as much space needed at the end. Else allocate the
4669 maximum possible needed (4 result bytes per Unicode character), and return
4670 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004671*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004672PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004673_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004674{
Tim Peters602f7402002-04-27 18:03:26 +00004675#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004676
Guido van Rossum98297ee2007-11-06 21:34:58 +00004677 Py_ssize_t i; /* index into s of next input byte */
4678 PyObject *result; /* result string object */
4679 char *p; /* next free byte in output buffer */
4680 Py_ssize_t nallocated; /* number of result bytes allocated */
4681 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004682 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004683 PyObject *errorHandler = NULL;
4684 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004685 int kind;
4686 void *data;
4687 Py_ssize_t size;
4688 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4689#if SIZEOF_WCHAR_T == 2
4690 Py_ssize_t wchar_offset = 0;
4691#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004692
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004693 if (!PyUnicode_Check(unicode)) {
4694 PyErr_BadArgument();
4695 return NULL;
4696 }
4697
4698 if (PyUnicode_READY(unicode) == -1)
4699 return NULL;
4700
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004701 if (PyUnicode_UTF8(unicode))
4702 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4703 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004704
4705 kind = PyUnicode_KIND(unicode);
4706 data = PyUnicode_DATA(unicode);
4707 size = PyUnicode_GET_LENGTH(unicode);
4708
Tim Peters602f7402002-04-27 18:03:26 +00004709 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004710
Tim Peters602f7402002-04-27 18:03:26 +00004711 if (size <= MAX_SHORT_UNICHARS) {
4712 /* Write into the stack buffer; nallocated can't overflow.
4713 * At the end, we'll allocate exactly as much heap space as it
4714 * turns out we need.
4715 */
4716 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004717 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004718 p = stackbuf;
4719 }
4720 else {
4721 /* Overallocate on the heap, and give the excess back at the end. */
4722 nallocated = size * 4;
4723 if (nallocated / 4 != size) /* overflow! */
4724 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004725 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004726 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004727 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004728 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004729 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004730
Tim Peters602f7402002-04-27 18:03:26 +00004731 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004732 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004733
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004734 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004735 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004736 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004737
Guido van Rossumd57fd912000-03-10 22:53:23 +00004738 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004739 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004740 *p++ = (char)(0xc0 | (ch >> 6));
4741 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004742 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004743 Py_ssize_t newpos;
4744 PyObject *rep;
4745 Py_ssize_t repsize, k, startpos;
4746 startpos = i-1;
4747#if SIZEOF_WCHAR_T == 2
4748 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004749#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004750 rep = unicode_encode_call_errorhandler(
4751 errors, &errorHandler, "utf-8", "surrogates not allowed",
4752 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4753 &exc, startpos, startpos+1, &newpos);
4754 if (!rep)
4755 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004756
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004757 if (PyBytes_Check(rep))
4758 repsize = PyBytes_GET_SIZE(rep);
4759 else
4760 repsize = PyUnicode_GET_SIZE(rep);
4761
4762 if (repsize > 4) {
4763 Py_ssize_t offset;
4764
4765 if (result == NULL)
4766 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004767 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004768 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004769
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004770 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4771 /* integer overflow */
4772 PyErr_NoMemory();
4773 goto error;
4774 }
4775 nallocated += repsize - 4;
4776 if (result != NULL) {
4777 if (_PyBytes_Resize(&result, nallocated) < 0)
4778 goto error;
4779 } else {
4780 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004781 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004782 goto error;
4783 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4784 }
4785 p = PyBytes_AS_STRING(result) + offset;
4786 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004787
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004788 if (PyBytes_Check(rep)) {
4789 char *prep = PyBytes_AS_STRING(rep);
4790 for(k = repsize; k > 0; k--)
4791 *p++ = *prep++;
4792 } else /* rep is unicode */ {
4793 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4794 Py_UNICODE c;
4795
4796 for(k=0; k<repsize; k++) {
4797 c = prep[k];
4798 if (0x80 <= c) {
4799 raise_encode_exception(&exc, "utf-8",
4800 PyUnicode_AS_UNICODE(unicode),
4801 size, i-1, i,
4802 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004803 goto error;
4804 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004805 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004806 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004807 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004808 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004809 } else if (ch < 0x10000) {
4810 *p++ = (char)(0xe0 | (ch >> 12));
4811 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4812 *p++ = (char)(0x80 | (ch & 0x3f));
4813 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004814 /* Encode UCS4 Unicode ordinals */
4815 *p++ = (char)(0xf0 | (ch >> 18));
4816 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4817 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4818 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004819#if SIZEOF_WCHAR_T == 2
4820 wchar_offset++;
4821#endif
Tim Peters602f7402002-04-27 18:03:26 +00004822 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004823 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004824
Guido van Rossum98297ee2007-11-06 21:34:58 +00004825 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004826 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004827 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004828 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004829 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004830 }
4831 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004832 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004833 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004834 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004835 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004836 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004837
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004838 Py_XDECREF(errorHandler);
4839 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004840 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004841 error:
4842 Py_XDECREF(errorHandler);
4843 Py_XDECREF(exc);
4844 Py_XDECREF(result);
4845 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004846
Tim Peters602f7402002-04-27 18:03:26 +00004847#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004848}
4849
Alexander Belopolsky40018472011-02-26 01:02:56 +00004850PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004851PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4852 Py_ssize_t size,
4853 const char *errors)
4854{
4855 PyObject *v, *unicode;
4856
4857 unicode = PyUnicode_FromUnicode(s, size);
4858 if (unicode == NULL)
4859 return NULL;
4860 v = _PyUnicode_AsUTF8String(unicode, errors);
4861 Py_DECREF(unicode);
4862 return v;
4863}
4864
4865PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004866PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004867{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004868 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004869}
4870
Walter Dörwald41980ca2007-08-16 21:55:45 +00004871/* --- UTF-32 Codec ------------------------------------------------------- */
4872
4873PyObject *
4874PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004875 Py_ssize_t size,
4876 const char *errors,
4877 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004878{
4879 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4880}
4881
4882PyObject *
4883PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004884 Py_ssize_t size,
4885 const char *errors,
4886 int *byteorder,
4887 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004888{
4889 const char *starts = s;
4890 Py_ssize_t startinpos;
4891 Py_ssize_t endinpos;
4892 Py_ssize_t outpos;
4893 PyUnicodeObject *unicode;
4894 Py_UNICODE *p;
4895#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004896 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004897 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004898#else
4899 const int pairs = 0;
4900#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004901 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004902 int bo = 0; /* assume native ordering by default */
4903 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004904 /* Offsets from q for retrieving bytes in the right order. */
4905#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4906 int iorder[] = {0, 1, 2, 3};
4907#else
4908 int iorder[] = {3, 2, 1, 0};
4909#endif
4910 PyObject *errorHandler = NULL;
4911 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004912
Walter Dörwald41980ca2007-08-16 21:55:45 +00004913 q = (unsigned char *)s;
4914 e = q + size;
4915
4916 if (byteorder)
4917 bo = *byteorder;
4918
4919 /* Check for BOM marks (U+FEFF) in the input and adjust current
4920 byte order setting accordingly. In native mode, the leading BOM
4921 mark is skipped, in all other modes, it is copied to the output
4922 stream as-is (giving a ZWNBSP character). */
4923 if (bo == 0) {
4924 if (size >= 4) {
4925 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004926 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004927#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004928 if (bom == 0x0000FEFF) {
4929 q += 4;
4930 bo = -1;
4931 }
4932 else if (bom == 0xFFFE0000) {
4933 q += 4;
4934 bo = 1;
4935 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004936#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004937 if (bom == 0x0000FEFF) {
4938 q += 4;
4939 bo = 1;
4940 }
4941 else if (bom == 0xFFFE0000) {
4942 q += 4;
4943 bo = -1;
4944 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004945#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004946 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004947 }
4948
4949 if (bo == -1) {
4950 /* force LE */
4951 iorder[0] = 0;
4952 iorder[1] = 1;
4953 iorder[2] = 2;
4954 iorder[3] = 3;
4955 }
4956 else if (bo == 1) {
4957 /* force BE */
4958 iorder[0] = 3;
4959 iorder[1] = 2;
4960 iorder[2] = 1;
4961 iorder[3] = 0;
4962 }
4963
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004964 /* On narrow builds we split characters outside the BMP into two
4965 codepoints => count how much extra space we need. */
4966#ifndef Py_UNICODE_WIDE
4967 for (qq = q; qq < e; qq += 4)
4968 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4969 pairs++;
4970#endif
4971
4972 /* This might be one to much, because of a BOM */
4973 unicode = _PyUnicode_New((size+3)/4+pairs);
4974 if (!unicode)
4975 return NULL;
4976 if (size == 0)
4977 return (PyObject *)unicode;
4978
4979 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004980 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004981
Walter Dörwald41980ca2007-08-16 21:55:45 +00004982 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004983 Py_UCS4 ch;
4984 /* remaining bytes at the end? (size should be divisible by 4) */
4985 if (e-q<4) {
4986 if (consumed)
4987 break;
4988 errmsg = "truncated data";
4989 startinpos = ((const char *)q)-starts;
4990 endinpos = ((const char *)e)-starts;
4991 goto utf32Error;
4992 /* The remaining input chars are ignored if the callback
4993 chooses to skip the input */
4994 }
4995 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4996 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004997
Benjamin Peterson29060642009-01-31 22:14:21 +00004998 if (ch >= 0x110000)
4999 {
5000 errmsg = "codepoint not in range(0x110000)";
5001 startinpos = ((const char *)q)-starts;
5002 endinpos = startinpos+4;
5003 goto utf32Error;
5004 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005005#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005006 if (ch >= 0x10000)
5007 {
5008 *p++ = 0xD800 | ((ch-0x10000) >> 10);
5009 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
5010 }
5011 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00005012#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005013 *p++ = ch;
5014 q += 4;
5015 continue;
5016 utf32Error:
5017 outpos = p-PyUnicode_AS_UNICODE(unicode);
5018 if (unicode_decode_call_errorhandler(
5019 errors, &errorHandler,
5020 "utf32", errmsg,
5021 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
5022 &unicode, &outpos, &p))
5023 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005024 }
5025
5026 if (byteorder)
5027 *byteorder = bo;
5028
5029 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005030 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005031
5032 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02005033 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005034 goto onError;
5035
5036 Py_XDECREF(errorHandler);
5037 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005038#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005039 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005040 Py_DECREF(unicode);
5041 return NULL;
5042 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005043#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005044 assert(_PyUnicode_CheckConsistency(unicode, 1));
Walter Dörwald41980ca2007-08-16 21:55:45 +00005045 return (PyObject *)unicode;
5046
Benjamin Peterson29060642009-01-31 22:14:21 +00005047 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005048 Py_DECREF(unicode);
5049 Py_XDECREF(errorHandler);
5050 Py_XDECREF(exc);
5051 return NULL;
5052}
5053
5054PyObject *
5055PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005056 Py_ssize_t size,
5057 const char *errors,
5058 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005059{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005060 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005061 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005062 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005063#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005064 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005065#else
5066 const int pairs = 0;
5067#endif
5068 /* Offsets from p for storing byte pairs in the right order. */
5069#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5070 int iorder[] = {0, 1, 2, 3};
5071#else
5072 int iorder[] = {3, 2, 1, 0};
5073#endif
5074
Benjamin Peterson29060642009-01-31 22:14:21 +00005075#define STORECHAR(CH) \
5076 do { \
5077 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5078 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5079 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5080 p[iorder[0]] = (CH) & 0xff; \
5081 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005082 } while(0)
5083
5084 /* In narrow builds we can output surrogate pairs as one codepoint,
5085 so we need less space. */
5086#ifndef Py_UNICODE_WIDE
5087 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005088 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
5089 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
5090 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005091#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005092 nsize = (size - pairs + (byteorder == 0));
5093 bytesize = nsize * 4;
5094 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005095 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005096 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005097 if (v == NULL)
5098 return NULL;
5099
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005100 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005101 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005102 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005103 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005104 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005105
5106 if (byteorder == -1) {
5107 /* force LE */
5108 iorder[0] = 0;
5109 iorder[1] = 1;
5110 iorder[2] = 2;
5111 iorder[3] = 3;
5112 }
5113 else if (byteorder == 1) {
5114 /* force BE */
5115 iorder[0] = 3;
5116 iorder[1] = 2;
5117 iorder[2] = 1;
5118 iorder[3] = 0;
5119 }
5120
5121 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005122 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005123#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005124 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
5125 Py_UCS4 ch2 = *s;
5126 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
5127 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
5128 s++;
5129 size--;
5130 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005131 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005132#endif
5133 STORECHAR(ch);
5134 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005135
5136 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005137 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005138#undef STORECHAR
5139}
5140
Alexander Belopolsky40018472011-02-26 01:02:56 +00005141PyObject *
5142PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005143{
5144 if (!PyUnicode_Check(unicode)) {
5145 PyErr_BadArgument();
5146 return NULL;
5147 }
5148 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005149 PyUnicode_GET_SIZE(unicode),
5150 NULL,
5151 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005152}
5153
Guido van Rossumd57fd912000-03-10 22:53:23 +00005154/* --- UTF-16 Codec ------------------------------------------------------- */
5155
Tim Peters772747b2001-08-09 22:21:55 +00005156PyObject *
5157PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005158 Py_ssize_t size,
5159 const char *errors,
5160 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005161{
Walter Dörwald69652032004-09-07 20:24:22 +00005162 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5163}
5164
Antoine Pitrouab868312009-01-10 15:40:25 +00005165/* Two masks for fast checking of whether a C 'long' may contain
5166 UTF16-encoded surrogate characters. This is an efficient heuristic,
5167 assuming that non-surrogate characters with a code point >= 0x8000 are
5168 rare in most input.
5169 FAST_CHAR_MASK is used when the input is in native byte ordering,
5170 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00005171*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005172#if (SIZEOF_LONG == 8)
5173# define FAST_CHAR_MASK 0x8000800080008000L
5174# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
5175#elif (SIZEOF_LONG == 4)
5176# define FAST_CHAR_MASK 0x80008000L
5177# define SWAPPED_FAST_CHAR_MASK 0x00800080L
5178#else
5179# error C 'long' size should be either 4 or 8!
5180#endif
5181
Walter Dörwald69652032004-09-07 20:24:22 +00005182PyObject *
5183PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005184 Py_ssize_t size,
5185 const char *errors,
5186 int *byteorder,
5187 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005188{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005189 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005190 Py_ssize_t startinpos;
5191 Py_ssize_t endinpos;
5192 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005193 PyUnicodeObject *unicode;
5194 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00005195 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005196 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005197 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005198 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005199 /* Offsets from q for retrieving byte pairs in the right order. */
5200#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5201 int ihi = 1, ilo = 0;
5202#else
5203 int ihi = 0, ilo = 1;
5204#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005205 PyObject *errorHandler = NULL;
5206 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005207
5208 /* Note: size will always be longer than the resulting Unicode
5209 character count */
5210 unicode = _PyUnicode_New(size);
5211 if (!unicode)
5212 return NULL;
5213 if (size == 0)
5214 return (PyObject *)unicode;
5215
5216 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005217 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00005218 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005219 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005220
5221 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005222 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005223
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005224 /* Check for BOM marks (U+FEFF) in the input and adjust current
5225 byte order setting accordingly. In native mode, the leading BOM
5226 mark is skipped, in all other modes, it is copied to the output
5227 stream as-is (giving a ZWNBSP character). */
5228 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005229 if (size >= 2) {
5230 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005231#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005232 if (bom == 0xFEFF) {
5233 q += 2;
5234 bo = -1;
5235 }
5236 else if (bom == 0xFFFE) {
5237 q += 2;
5238 bo = 1;
5239 }
Tim Petersced69f82003-09-16 20:30:58 +00005240#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005241 if (bom == 0xFEFF) {
5242 q += 2;
5243 bo = 1;
5244 }
5245 else if (bom == 0xFFFE) {
5246 q += 2;
5247 bo = -1;
5248 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005249#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005250 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005251 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005252
Tim Peters772747b2001-08-09 22:21:55 +00005253 if (bo == -1) {
5254 /* force LE */
5255 ihi = 1;
5256 ilo = 0;
5257 }
5258 else if (bo == 1) {
5259 /* force BE */
5260 ihi = 0;
5261 ilo = 1;
5262 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005263#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5264 native_ordering = ilo < ihi;
5265#else
5266 native_ordering = ilo > ihi;
5267#endif
Tim Peters772747b2001-08-09 22:21:55 +00005268
Antoine Pitrouab868312009-01-10 15:40:25 +00005269 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005270 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005271 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005272 /* First check for possible aligned read of a C 'long'. Unaligned
5273 reads are more expensive, better to defer to another iteration. */
5274 if (!((size_t) q & LONG_PTR_MASK)) {
5275 /* Fast path for runs of non-surrogate chars. */
5276 register const unsigned char *_q = q;
5277 Py_UNICODE *_p = p;
5278 if (native_ordering) {
5279 /* Native ordering is simple: as long as the input cannot
5280 possibly contain a surrogate char, do an unrolled copy
5281 of several 16-bit code points to the target object.
5282 The non-surrogate check is done on several input bytes
5283 at a time (as many as a C 'long' can contain). */
5284 while (_q < aligned_end) {
5285 unsigned long data = * (unsigned long *) _q;
5286 if (data & FAST_CHAR_MASK)
5287 break;
5288 _p[0] = ((unsigned short *) _q)[0];
5289 _p[1] = ((unsigned short *) _q)[1];
5290#if (SIZEOF_LONG == 8)
5291 _p[2] = ((unsigned short *) _q)[2];
5292 _p[3] = ((unsigned short *) _q)[3];
5293#endif
5294 _q += SIZEOF_LONG;
5295 _p += SIZEOF_LONG / 2;
5296 }
5297 }
5298 else {
5299 /* Byteswapped ordering is similar, but we must decompose
5300 the copy bytewise, and take care of zero'ing out the
5301 upper bytes if the target object is in 32-bit units
5302 (that is, in UCS-4 builds). */
5303 while (_q < aligned_end) {
5304 unsigned long data = * (unsigned long *) _q;
5305 if (data & SWAPPED_FAST_CHAR_MASK)
5306 break;
5307 /* Zero upper bytes in UCS-4 builds */
5308#if (Py_UNICODE_SIZE > 2)
5309 _p[0] = 0;
5310 _p[1] = 0;
5311#if (SIZEOF_LONG == 8)
5312 _p[2] = 0;
5313 _p[3] = 0;
5314#endif
5315#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005316 /* Issue #4916; UCS-4 builds on big endian machines must
5317 fill the two last bytes of each 4-byte unit. */
5318#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
5319# define OFF 2
5320#else
5321# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00005322#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005323 ((unsigned char *) _p)[OFF + 1] = _q[0];
5324 ((unsigned char *) _p)[OFF + 0] = _q[1];
5325 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
5326 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
5327#if (SIZEOF_LONG == 8)
5328 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
5329 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
5330 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
5331 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
5332#endif
5333#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00005334 _q += SIZEOF_LONG;
5335 _p += SIZEOF_LONG / 2;
5336 }
5337 }
5338 p = _p;
5339 q = _q;
5340 if (q >= e)
5341 break;
5342 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005343 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005344
Benjamin Peterson14339b62009-01-31 16:36:08 +00005345 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005346
5347 if (ch < 0xD800 || ch > 0xDFFF) {
5348 *p++ = ch;
5349 continue;
5350 }
5351
5352 /* UTF-16 code pair: */
5353 if (q > e) {
5354 errmsg = "unexpected end of data";
5355 startinpos = (((const char *)q) - 2) - starts;
5356 endinpos = ((const char *)e) + 1 - starts;
5357 goto utf16Error;
5358 }
5359 if (0xD800 <= ch && ch <= 0xDBFF) {
5360 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
5361 q += 2;
5362 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00005363#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005364 *p++ = ch;
5365 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005366#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005367 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005368#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005369 continue;
5370 }
5371 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005372 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005373 startinpos = (((const char *)q)-4)-starts;
5374 endinpos = startinpos+2;
5375 goto utf16Error;
5376 }
5377
Benjamin Peterson14339b62009-01-31 16:36:08 +00005378 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005379 errmsg = "illegal encoding";
5380 startinpos = (((const char *)q)-2)-starts;
5381 endinpos = startinpos+2;
5382 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005383
Benjamin Peterson29060642009-01-31 22:14:21 +00005384 utf16Error:
5385 outpos = p - PyUnicode_AS_UNICODE(unicode);
5386 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005387 errors,
5388 &errorHandler,
5389 "utf16", errmsg,
5390 &starts,
5391 (const char **)&e,
5392 &startinpos,
5393 &endinpos,
5394 &exc,
5395 (const char **)&q,
5396 &unicode,
5397 &outpos,
5398 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00005399 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005400 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005401 /* remaining byte at the end? (size should be even) */
5402 if (e == q) {
5403 if (!consumed) {
5404 errmsg = "truncated data";
5405 startinpos = ((const char *)q) - starts;
5406 endinpos = ((const char *)e) + 1 - starts;
5407 outpos = p - PyUnicode_AS_UNICODE(unicode);
5408 if (unicode_decode_call_errorhandler(
5409 errors,
5410 &errorHandler,
5411 "utf16", errmsg,
5412 &starts,
5413 (const char **)&e,
5414 &startinpos,
5415 &endinpos,
5416 &exc,
5417 (const char **)&q,
5418 &unicode,
5419 &outpos,
5420 &p))
5421 goto onError;
5422 /* The remaining input chars are ignored if the callback
5423 chooses to skip the input */
5424 }
5425 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005426
5427 if (byteorder)
5428 *byteorder = bo;
5429
Walter Dörwald69652032004-09-07 20:24:22 +00005430 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005431 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005432
Guido van Rossumd57fd912000-03-10 22:53:23 +00005433 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02005434 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005435 goto onError;
5436
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005437 Py_XDECREF(errorHandler);
5438 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005439#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005440 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005441 Py_DECREF(unicode);
5442 return NULL;
5443 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005444#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005445 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005446 return (PyObject *)unicode;
5447
Benjamin Peterson29060642009-01-31 22:14:21 +00005448 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005449 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005450 Py_XDECREF(errorHandler);
5451 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005452 return NULL;
5453}
5454
Antoine Pitrouab868312009-01-10 15:40:25 +00005455#undef FAST_CHAR_MASK
5456#undef SWAPPED_FAST_CHAR_MASK
5457
Tim Peters772747b2001-08-09 22:21:55 +00005458PyObject *
5459PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005460 Py_ssize_t size,
5461 const char *errors,
5462 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005463{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005464 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005465 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005466 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005467#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005468 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005469#else
5470 const int pairs = 0;
5471#endif
Tim Peters772747b2001-08-09 22:21:55 +00005472 /* Offsets from p for storing byte pairs in the right order. */
5473#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5474 int ihi = 1, ilo = 0;
5475#else
5476 int ihi = 0, ilo = 1;
5477#endif
5478
Benjamin Peterson29060642009-01-31 22:14:21 +00005479#define STORECHAR(CH) \
5480 do { \
5481 p[ihi] = ((CH) >> 8) & 0xff; \
5482 p[ilo] = (CH) & 0xff; \
5483 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005484 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005485
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005486#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005487 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005488 if (s[i] >= 0x10000)
5489 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005490#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005491 /* 2 * (size + pairs + (byteorder == 0)) */
5492 if (size > PY_SSIZE_T_MAX ||
5493 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005494 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005495 nsize = size + pairs + (byteorder == 0);
5496 bytesize = nsize * 2;
5497 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005498 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005499 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005500 if (v == NULL)
5501 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005502
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005503 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005504 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005505 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005506 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005507 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005508
5509 if (byteorder == -1) {
5510 /* force LE */
5511 ihi = 1;
5512 ilo = 0;
5513 }
5514 else if (byteorder == 1) {
5515 /* force BE */
5516 ihi = 0;
5517 ilo = 1;
5518 }
5519
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005520 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005521 Py_UNICODE ch = *s++;
5522 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005523#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005524 if (ch >= 0x10000) {
5525 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5526 ch = 0xD800 | ((ch-0x10000) >> 10);
5527 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005528#endif
Tim Peters772747b2001-08-09 22:21:55 +00005529 STORECHAR(ch);
5530 if (ch2)
5531 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005532 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005533
5534 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005535 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005536#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005537}
5538
Alexander Belopolsky40018472011-02-26 01:02:56 +00005539PyObject *
5540PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005541{
5542 if (!PyUnicode_Check(unicode)) {
5543 PyErr_BadArgument();
5544 return NULL;
5545 }
5546 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005547 PyUnicode_GET_SIZE(unicode),
5548 NULL,
5549 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005550}
5551
5552/* --- Unicode Escape Codec ----------------------------------------------- */
5553
/* Helper function for PyUnicode_DecodeUnicodeEscape: decides whether the
   escaped input decodes to a pure ASCII string and, if so, how long that
   string is.

   s, size   the raw escaped input and its length in bytes.

   Returns the number of characters the decoded string will have, or -1
   when the result is not known to be ASCII: a byte > 127 occurs, a
   backslash is the last byte, an octal, \x, \u, \U or \N escape is used,
   or size is negative.
 */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Reject a negative size before forming s + size: computing a pointer
       in front of the buffer would be undefined behaviour. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                /* single-character escapes yield one character */
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* unknown escape: count the backslash + the other
                   character */
                length += 2;
            }
        }
    }
    return length;
}
5608
5609/* Similar to PyUnicode_WRITE but either write into wstr field
5610 or treat string as ASCII. */
5611#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \
5612 do { \
5613 if ((kind) != PyUnicode_WCHAR_KIND) \
5614 ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
5615 else \
5616 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
5617 } while (0)
5618
5619#define WRITE_WSTR(buf, index, value) \
5620 assert(kind == PyUnicode_WCHAR_KIND), \
5621 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value)
5622
5623
Fredrik Lundh06d12682001-01-24 07:59:11 +00005624static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005625
Alexander Belopolsky40018472011-02-26 01:02:56 +00005626PyObject *
5627PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005628 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005629 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005630{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005631 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005632 Py_ssize_t startinpos;
5633 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005634 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005635 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005636 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005637 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005638 char* message;
5639 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005640 PyObject *errorHandler = NULL;
5641 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005642 Py_ssize_t ascii_length;
5643 Py_ssize_t i;
5644 int kind;
5645 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005646
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005647 ascii_length = length_of_escaped_ascii_string(s, size);
5648
5649 /* After length_of_escaped_ascii_string() there are two alternatives,
5650 either the string is pure ASCII with named escapes like \n, etc.
5651 and we determined it's exact size (common case)
5652 or it contains \x, \u, ... escape sequences. then we create a
5653 legacy wchar string and resize it at the end of this function. */
5654 if (ascii_length >= 0) {
5655 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
5656 if (!v)
5657 goto onError;
5658 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5659 kind = PyUnicode_1BYTE_KIND;
5660 data = PyUnicode_DATA(v);
5661 }
5662 else {
5663 /* Escaped strings will always be longer than the resulting
5664 Unicode string, so we start with size here and then reduce the
5665 length after conversion to the true value.
5666 (but if the error callback returns a long replacement string
5667 we'll have to allocate more space) */
5668 v = _PyUnicode_New(size);
5669 if (!v)
5670 goto onError;
5671 kind = PyUnicode_WCHAR_KIND;
5672 data = PyUnicode_AS_UNICODE(v);
5673 }
5674
Guido van Rossumd57fd912000-03-10 22:53:23 +00005675 if (size == 0)
5676 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005677 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005678 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005679
Guido van Rossumd57fd912000-03-10 22:53:23 +00005680 while (s < end) {
5681 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005682 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005683 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005684
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005685 if (kind == PyUnicode_WCHAR_KIND) {
5686 assert(i < _PyUnicode_WSTR_LENGTH(v));
5687 }
5688 else {
5689 /* The only case in which i == ascii_length is a backslash
5690 followed by a newline. */
5691 assert(i <= ascii_length);
5692 }
5693
Guido van Rossumd57fd912000-03-10 22:53:23 +00005694 /* Non-escape characters are interpreted as Unicode ordinals */
5695 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005696 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005697 continue;
5698 }
5699
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005700 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005701 /* \ - Escapes */
5702 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005703 c = *s++;
5704 if (s > end)
5705 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005706
5707 if (kind == PyUnicode_WCHAR_KIND) {
5708 assert(i < _PyUnicode_WSTR_LENGTH(v));
5709 }
5710 else {
5711 /* The only case in which i == ascii_length is a backslash
5712 followed by a newline. */
5713 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5714 }
5715
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005716 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005717
Benjamin Peterson29060642009-01-31 22:14:21 +00005718 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005719 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005720 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5721 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5722 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5723 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5724 /* FF */
5725 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5726 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5727 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5728 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5729 /* VT */
5730 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5731 /* BEL, not classic C */
5732 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005733
Benjamin Peterson29060642009-01-31 22:14:21 +00005734 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005735 case '0': case '1': case '2': case '3':
5736 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005737 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005738 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005739 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005740 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005741 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005742 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005743 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005744 break;
5745
Benjamin Peterson29060642009-01-31 22:14:21 +00005746 /* hex escapes */
5747 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005748 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005749 digits = 2;
5750 message = "truncated \\xXX escape";
5751 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005752
Benjamin Peterson29060642009-01-31 22:14:21 +00005753 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005754 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005755 digits = 4;
5756 message = "truncated \\uXXXX escape";
5757 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005758
Benjamin Peterson29060642009-01-31 22:14:21 +00005759 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005760 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005761 digits = 8;
5762 message = "truncated \\UXXXXXXXX escape";
5763 hexescape:
5764 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005765 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005766 if (s+digits>end) {
5767 endinpos = size;
5768 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005769 errors, &errorHandler,
5770 "unicodeescape", "end of string in escape sequence",
5771 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005772 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005773 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005774 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005775 goto nextByte;
5776 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005777 for (j = 0; j < digits; ++j) {
5778 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005779 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005780 endinpos = (s+j+1)-starts;
5781 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005782 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005783 errors, &errorHandler,
5784 "unicodeescape", message,
5785 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005786 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005787 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005788 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005789 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005790 }
5791 chr = (chr<<4) & ~0xF;
5792 if (c >= '0' && c <= '9')
5793 chr += c - '0';
5794 else if (c >= 'a' && c <= 'f')
5795 chr += 10 + c - 'a';
5796 else
5797 chr += 10 + c - 'A';
5798 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005799 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005800 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005801 /* _decoding_error will have already written into the
5802 target buffer. */
5803 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005804 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005805 /* when we get here, chr is a 32-bit unicode character */
5806 if (chr <= 0xffff)
5807 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005808 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005809 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005810 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005811 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005812#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005813 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005814#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005815 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005816 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5817 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005818#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005819 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005820 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005821 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005822 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005823 errors, &errorHandler,
5824 "unicodeescape", "illegal Unicode character",
5825 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005826 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005827 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005828 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005829 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005830 break;
5831
Benjamin Peterson29060642009-01-31 22:14:21 +00005832 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005833 case 'N':
5834 message = "malformed \\N character escape";
5835 if (ucnhash_CAPI == NULL) {
5836 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005837 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5838 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005839 if (ucnhash_CAPI == NULL)
5840 goto ucnhashError;
5841 }
5842 if (*s == '{') {
5843 const char *start = s+1;
5844 /* look for the closing brace */
5845 while (*s != '}' && s < end)
5846 s++;
5847 if (s > start && s < end && *s == '}') {
5848 /* found a name. look it up in the unicode database */
5849 message = "unknown Unicode character name";
5850 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005851 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5852 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005853 goto store;
5854 }
5855 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005856 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005857 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005858 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005859 errors, &errorHandler,
5860 "unicodeescape", message,
5861 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005862 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005863 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005864 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005865 break;
5866
5867 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005868 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005869 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005870 message = "\\ at end of string";
5871 s--;
5872 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005873 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005874 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005875 errors, &errorHandler,
5876 "unicodeescape", message,
5877 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005878 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005879 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005880 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005881 }
5882 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005883 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5884 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005885 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005886 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005887 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005888 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005889 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005890 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005891 /* Ensure the length prediction worked in case of ASCII strings */
5892 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5893
Victor Stinnerfe226c02011-10-03 03:52:20 +02005894 if (kind == PyUnicode_WCHAR_KIND)
5895 {
5896 if (PyUnicode_Resize((PyObject**)&v, i) < 0)
5897 goto onError;
Victor Stinnerfe226c02011-10-03 03:52:20 +02005898 }
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005899 Py_XDECREF(errorHandler);
5900 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005901#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005902 if (_PyUnicode_READY_REPLACE(&v)) {
5903 Py_DECREF(v);
5904 return NULL;
5905 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005906#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005907 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005908 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005909
Benjamin Peterson29060642009-01-31 22:14:21 +00005910 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005911 PyErr_SetString(
5912 PyExc_UnicodeError,
5913 "\\N escapes not supported (can't load unicodedata module)"
5914 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005915 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005916 Py_XDECREF(errorHandler);
5917 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005918 return NULL;
5919
Benjamin Peterson29060642009-01-31 22:14:21 +00005920 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005921 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005922 Py_XDECREF(errorHandler);
5923 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005924 return NULL;
5925}
5926
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005927#undef WRITE_ASCII_OR_WSTR
5928#undef WRITE_WSTR
5929
/* Return a Unicode-Escape string version of the Unicode object.

   The result is a bytes object containing only the escaped characters;
   no surrounding quotes are added.

*/
5936
Walter Dörwald79e913e2007-05-12 11:08:06 +00005937static const char *hexdigits = "0123456789abcdef";
5938
Alexander Belopolsky40018472011-02-26 01:02:56 +00005939PyObject *
5940PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005941 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005942{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005943 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005944 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005945
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005946#ifdef Py_UNICODE_WIDE
5947 const Py_ssize_t expandsize = 10;
5948#else
5949 const Py_ssize_t expandsize = 6;
5950#endif
5951
Thomas Wouters89f507f2006-12-13 04:49:30 +00005952 /* XXX(nnorwitz): rather than over-allocating, it would be
5953 better to choose a different scheme. Perhaps scan the
5954 first N-chars of the string and allocate based on that size.
5955 */
5956 /* Initial allocation is based on the longest-possible unichr
5957 escape.
5958
5959 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5960 unichr, so in this case it's the longest unichr escape. In
5961 narrow (UTF-16) builds this is five chars per source unichr
5962 since there are two unichrs in the surrogate pair, so in narrow
5963 (UTF-16) builds it's not the longest unichr escape.
5964
5965 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5966 so in the narrow (UTF-16) build case it's the longest unichr
5967 escape.
5968 */
5969
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005970 if (size == 0)
5971 return PyBytes_FromStringAndSize(NULL, 0);
5972
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005973 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005974 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005975
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005976 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005977 2
5978 + expandsize*size
5979 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005980 if (repr == NULL)
5981 return NULL;
5982
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005983 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005984
Guido van Rossumd57fd912000-03-10 22:53:23 +00005985 while (size-- > 0) {
5986 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005987
Walter Dörwald79e913e2007-05-12 11:08:06 +00005988 /* Escape backslashes */
5989 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005990 *p++ = '\\';
5991 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005992 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005993 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005994
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005995#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005996 /* Map 21-bit characters to '\U00xxxxxx' */
5997 else if (ch >= 0x10000) {
5998 *p++ = '\\';
5999 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00006000 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
6001 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
6002 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
6003 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
6004 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
6005 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
6006 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
6007 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00006008 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006009 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00006010#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006011 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
6012 else if (ch >= 0xD800 && ch < 0xDC00) {
6013 Py_UNICODE ch2;
6014 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00006015
Benjamin Peterson29060642009-01-31 22:14:21 +00006016 ch2 = *s++;
6017 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00006018 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006019 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
6020 *p++ = '\\';
6021 *p++ = 'U';
6022 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
6023 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
6024 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
6025 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
6026 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
6027 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
6028 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
6029 *p++ = hexdigits[ucs & 0x0000000F];
6030 continue;
6031 }
6032 /* Fall through: isolated surrogates are copied as-is */
6033 s--;
6034 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006035 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00006036#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006037
Guido van Rossumd57fd912000-03-10 22:53:23 +00006038 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006039 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006040 *p++ = '\\';
6041 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00006042 *p++ = hexdigits[(ch >> 12) & 0x000F];
6043 *p++ = hexdigits[(ch >> 8) & 0x000F];
6044 *p++ = hexdigits[(ch >> 4) & 0x000F];
6045 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006046 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006047
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006048 /* Map special whitespace to '\t', \n', '\r' */
6049 else if (ch == '\t') {
6050 *p++ = '\\';
6051 *p++ = 't';
6052 }
6053 else if (ch == '\n') {
6054 *p++ = '\\';
6055 *p++ = 'n';
6056 }
6057 else if (ch == '\r') {
6058 *p++ = '\\';
6059 *p++ = 'r';
6060 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006061
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006062 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006063 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006064 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006065 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00006066 *p++ = hexdigits[(ch >> 4) & 0x000F];
6067 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006068 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006069
Guido van Rossumd57fd912000-03-10 22:53:23 +00006070 /* Copy everything else as-is */
6071 else
6072 *p++ = (char) ch;
6073 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006074
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006075 assert(p - PyBytes_AS_STRING(repr) > 0);
6076 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6077 return NULL;
6078 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006079}
6080
Alexander Belopolsky40018472011-02-26 01:02:56 +00006081PyObject *
6082PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006083{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006084 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006085 if (!PyUnicode_Check(unicode)) {
6086 PyErr_BadArgument();
6087 return NULL;
6088 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00006089 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
6090 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006091 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006092}
6093
6094/* --- Raw Unicode Escape Codec ------------------------------------------- */
6095
Alexander Belopolsky40018472011-02-26 01:02:56 +00006096PyObject *
6097PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006098 Py_ssize_t size,
6099 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006100{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006101 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006102 Py_ssize_t startinpos;
6103 Py_ssize_t endinpos;
6104 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006105 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006106 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006107 const char *end;
6108 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006109 PyObject *errorHandler = NULL;
6110 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006111
Guido van Rossumd57fd912000-03-10 22:53:23 +00006112 /* Escaped strings will always be longer than the resulting
6113 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006114 length after conversion to the true value. (But decoding error
6115 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006116 v = _PyUnicode_New(size);
6117 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006118 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006119 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006120 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006121 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006122 end = s + size;
6123 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006124 unsigned char c;
6125 Py_UCS4 x;
6126 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006127 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006128
Benjamin Peterson29060642009-01-31 22:14:21 +00006129 /* Non-escape characters are interpreted as Unicode ordinals */
6130 if (*s != '\\') {
6131 *p++ = (unsigned char)*s++;
6132 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006133 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006134 startinpos = s-starts;
6135
6136 /* \u-escapes are only interpreted iff the number of leading
6137 backslashes if odd */
6138 bs = s;
6139 for (;s < end;) {
6140 if (*s != '\\')
6141 break;
6142 *p++ = (unsigned char)*s++;
6143 }
6144 if (((s - bs) & 1) == 0 ||
6145 s >= end ||
6146 (*s != 'u' && *s != 'U')) {
6147 continue;
6148 }
6149 p--;
6150 count = *s=='u' ? 4 : 8;
6151 s++;
6152
6153 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
6154 outpos = p-PyUnicode_AS_UNICODE(v);
6155 for (x = 0, i = 0; i < count; ++i, ++s) {
6156 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006157 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006158 endinpos = s-starts;
6159 if (unicode_decode_call_errorhandler(
6160 errors, &errorHandler,
6161 "rawunicodeescape", "truncated \\uXXXX",
6162 &starts, &end, &startinpos, &endinpos, &exc, &s,
6163 &v, &outpos, &p))
6164 goto onError;
6165 goto nextByte;
6166 }
6167 x = (x<<4) & ~0xF;
6168 if (c >= '0' && c <= '9')
6169 x += c - '0';
6170 else if (c >= 'a' && c <= 'f')
6171 x += 10 + c - 'a';
6172 else
6173 x += 10 + c - 'A';
6174 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00006175 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00006176 /* UCS-2 character */
6177 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006178 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006179 /* UCS-4 character. Either store directly, or as
6180 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00006181#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006182 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006183#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006184 x -= 0x10000L;
6185 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
6186 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00006187#endif
6188 } else {
6189 endinpos = s-starts;
6190 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006191 if (unicode_decode_call_errorhandler(
6192 errors, &errorHandler,
6193 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006194 &starts, &end, &startinpos, &endinpos, &exc, &s,
6195 &v, &outpos, &p))
6196 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006197 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006198 nextByte:
6199 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006200 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02006201 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006202 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006203 Py_XDECREF(errorHandler);
6204 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006205#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006206 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006207 Py_DECREF(v);
6208 return NULL;
6209 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006210#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006211 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006212 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006213
Benjamin Peterson29060642009-01-31 22:14:21 +00006214 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006215 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006216 Py_XDECREF(errorHandler);
6217 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006218 return NULL;
6219}
6220
Alexander Belopolsky40018472011-02-26 01:02:56 +00006221PyObject *
6222PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006223 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006224{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006225 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006226 char *p;
6227 char *q;
6228
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006229#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006230 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006231#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006232 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006233#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00006234
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006235 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006236 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006237
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006238 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006239 if (repr == NULL)
6240 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00006241 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006242 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006243
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006244 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006245 while (size-- > 0) {
6246 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006247#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006248 /* Map 32-bit characters to '\Uxxxxxxxx' */
6249 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006250 *p++ = '\\';
6251 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00006252 *p++ = hexdigits[(ch >> 28) & 0xf];
6253 *p++ = hexdigits[(ch >> 24) & 0xf];
6254 *p++ = hexdigits[(ch >> 20) & 0xf];
6255 *p++ = hexdigits[(ch >> 16) & 0xf];
6256 *p++ = hexdigits[(ch >> 12) & 0xf];
6257 *p++ = hexdigits[(ch >> 8) & 0xf];
6258 *p++ = hexdigits[(ch >> 4) & 0xf];
6259 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006260 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006261 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00006262#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006263 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
6264 if (ch >= 0xD800 && ch < 0xDC00) {
6265 Py_UNICODE ch2;
6266 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006267
Benjamin Peterson29060642009-01-31 22:14:21 +00006268 ch2 = *s++;
6269 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00006270 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006271 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
6272 *p++ = '\\';
6273 *p++ = 'U';
6274 *p++ = hexdigits[(ucs >> 28) & 0xf];
6275 *p++ = hexdigits[(ucs >> 24) & 0xf];
6276 *p++ = hexdigits[(ucs >> 20) & 0xf];
6277 *p++ = hexdigits[(ucs >> 16) & 0xf];
6278 *p++ = hexdigits[(ucs >> 12) & 0xf];
6279 *p++ = hexdigits[(ucs >> 8) & 0xf];
6280 *p++ = hexdigits[(ucs >> 4) & 0xf];
6281 *p++ = hexdigits[ucs & 0xf];
6282 continue;
6283 }
6284 /* Fall through: isolated surrogates are copied as-is */
6285 s--;
6286 size++;
6287 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006288#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006289 /* Map 16-bit characters to '\uxxxx' */
6290 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006291 *p++ = '\\';
6292 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00006293 *p++ = hexdigits[(ch >> 12) & 0xf];
6294 *p++ = hexdigits[(ch >> 8) & 0xf];
6295 *p++ = hexdigits[(ch >> 4) & 0xf];
6296 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006297 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006298 /* Copy everything else as-is */
6299 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006300 *p++ = (char) ch;
6301 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006302 size = p - q;
6303
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006304 assert(size > 0);
6305 if (_PyBytes_Resize(&repr, size) < 0)
6306 return NULL;
6307 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006308}
6309
Alexander Belopolsky40018472011-02-26 01:02:56 +00006310PyObject *
6311PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006312{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006313 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006314 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00006315 PyErr_BadArgument();
6316 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006317 }
Walter Dörwald711005d2007-05-12 12:03:26 +00006318 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
6319 PyUnicode_GET_SIZE(unicode));
6320
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006321 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006322}
6323
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006324/* --- Unicode Internal Codec ------------------------------------------- */
6325
Alexander Belopolsky40018472011-02-26 01:02:56 +00006326PyObject *
6327_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006328 Py_ssize_t size,
6329 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006330{
6331 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006332 Py_ssize_t startinpos;
6333 Py_ssize_t endinpos;
6334 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006335 PyUnicodeObject *v;
6336 Py_UNICODE *p;
6337 const char *end;
6338 const char *reason;
6339 PyObject *errorHandler = NULL;
6340 PyObject *exc = NULL;
6341
Neal Norwitzd43069c2006-01-08 01:12:10 +00006342#ifdef Py_UNICODE_WIDE
6343 Py_UNICODE unimax = PyUnicode_GetMax();
6344#endif
6345
Thomas Wouters89f507f2006-12-13 04:49:30 +00006346 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006347 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
6348 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006349 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006350 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
6351 as string was created with the old API. */
6352 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006353 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006354 p = PyUnicode_AS_UNICODE(v);
6355 end = s + size;
6356
6357 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006358 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006359 /* We have to sanity check the raw data, otherwise doom looms for
6360 some malformed UCS-4 data. */
6361 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006362#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006363 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006364#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006365 end-s < Py_UNICODE_SIZE
6366 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006367 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006368 startinpos = s - starts;
6369 if (end-s < Py_UNICODE_SIZE) {
6370 endinpos = end-starts;
6371 reason = "truncated input";
6372 }
6373 else {
6374 endinpos = s - starts + Py_UNICODE_SIZE;
6375 reason = "illegal code point (> 0x10FFFF)";
6376 }
6377 outpos = p - PyUnicode_AS_UNICODE(v);
6378 if (unicode_decode_call_errorhandler(
6379 errors, &errorHandler,
6380 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006381 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00006382 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006383 goto onError;
6384 }
6385 }
6386 else {
6387 p++;
6388 s += Py_UNICODE_SIZE;
6389 }
6390 }
6391
Victor Stinnerfe226c02011-10-03 03:52:20 +02006392 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006393 goto onError;
6394 Py_XDECREF(errorHandler);
6395 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006396#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006397 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006398 Py_DECREF(v);
6399 return NULL;
6400 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006401#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006402 assert(_PyUnicode_CheckConsistency(v, 1));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006403 return (PyObject *)v;
6404
Benjamin Peterson29060642009-01-31 22:14:21 +00006405 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006406 Py_XDECREF(v);
6407 Py_XDECREF(errorHandler);
6408 Py_XDECREF(exc);
6409 return NULL;
6410}
6411
Guido van Rossumd57fd912000-03-10 22:53:23 +00006412/* --- Latin-1 Codec ------------------------------------------------------ */
6413
Alexander Belopolsky40018472011-02-26 01:02:56 +00006414PyObject *
6415PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006416 Py_ssize_t size,
6417 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006418{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006419 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006420 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006421}
6422
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006423/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006424static void
6425make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006426 const char *encoding,
6427 const Py_UNICODE *unicode, Py_ssize_t size,
6428 Py_ssize_t startpos, Py_ssize_t endpos,
6429 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006430{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006431 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006432 *exceptionObject = PyUnicodeEncodeError_Create(
6433 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006434 }
6435 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006436 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6437 goto onError;
6438 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6439 goto onError;
6440 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6441 goto onError;
6442 return;
6443 onError:
6444 Py_DECREF(*exceptionObject);
6445 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006446 }
6447}
6448
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006449/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006450static void
6451raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006452 const char *encoding,
6453 const Py_UNICODE *unicode, Py_ssize_t size,
6454 Py_ssize_t startpos, Py_ssize_t endpos,
6455 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006456{
6457 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006458 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006459 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006460 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006461}
6462
6463/* error handling callback helper:
6464 build arguments, call the callback and check the arguments,
6465 put the result into newpos and return the replacement string, which
6466 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006467static PyObject *
6468unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006469 PyObject **errorHandler,
6470 const char *encoding, const char *reason,
6471 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
6472 Py_ssize_t startpos, Py_ssize_t endpos,
6473 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006474{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006475 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006476
6477 PyObject *restuple;
6478 PyObject *resunicode;
6479
6480 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006481 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006482 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006483 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006484 }
6485
6486 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006487 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006488 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006489 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006490
6491 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006492 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006493 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006494 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006495 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006496 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006497 Py_DECREF(restuple);
6498 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006499 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006500 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006501 &resunicode, newpos)) {
6502 Py_DECREF(restuple);
6503 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006504 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006505 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6506 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6507 Py_DECREF(restuple);
6508 return NULL;
6509 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006510 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006511 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006512 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006513 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6514 Py_DECREF(restuple);
6515 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006516 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006517 Py_INCREF(resunicode);
6518 Py_DECREF(restuple);
6519 return resunicode;
6520}
6521
Alexander Belopolsky40018472011-02-26 01:02:56 +00006522static PyObject *
6523unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006524 Py_ssize_t size,
6525 const char *errors,
6526 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006527{
6528 /* output object */
6529 PyObject *res;
6530 /* pointers to the beginning and end+1 of input */
6531 const Py_UNICODE *startp = p;
6532 const Py_UNICODE *endp = p + size;
6533 /* pointer to the beginning of the unencodable characters */
6534 /* const Py_UNICODE *badp = NULL; */
6535 /* pointer into the output */
6536 char *str;
6537 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006538 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006539 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6540 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006541 PyObject *errorHandler = NULL;
6542 PyObject *exc = NULL;
6543 /* the following variable is used for caching string comparisons
6544 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6545 int known_errorHandler = -1;
6546
6547 /* allocate enough for a simple encoding without
6548 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006549 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006550 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006551 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006552 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006553 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006554 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006555 ressize = size;
6556
6557 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006558 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006559
Benjamin Peterson29060642009-01-31 22:14:21 +00006560 /* can we encode this? */
6561 if (c<limit) {
6562 /* no overflow check, because we know that the space is enough */
6563 *str++ = (char)c;
6564 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006565 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006566 else {
6567 Py_ssize_t unicodepos = p-startp;
6568 Py_ssize_t requiredsize;
6569 PyObject *repunicode;
6570 Py_ssize_t repsize;
6571 Py_ssize_t newpos;
6572 Py_ssize_t respos;
6573 Py_UNICODE *uni2;
6574 /* startpos for collecting unencodable chars */
6575 const Py_UNICODE *collstart = p;
6576 const Py_UNICODE *collend = p;
6577 /* find all unecodable characters */
6578 while ((collend < endp) && ((*collend)>=limit))
6579 ++collend;
6580 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6581 if (known_errorHandler==-1) {
6582 if ((errors==NULL) || (!strcmp(errors, "strict")))
6583 known_errorHandler = 1;
6584 else if (!strcmp(errors, "replace"))
6585 known_errorHandler = 2;
6586 else if (!strcmp(errors, "ignore"))
6587 known_errorHandler = 3;
6588 else if (!strcmp(errors, "xmlcharrefreplace"))
6589 known_errorHandler = 4;
6590 else
6591 known_errorHandler = 0;
6592 }
6593 switch (known_errorHandler) {
6594 case 1: /* strict */
6595 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
6596 goto onError;
6597 case 2: /* replace */
6598 while (collstart++<collend)
6599 *str++ = '?'; /* fall through */
6600 case 3: /* ignore */
6601 p = collend;
6602 break;
6603 case 4: /* xmlcharrefreplace */
6604 respos = str - PyBytes_AS_STRING(res);
6605 /* determine replacement size (temporarily (mis)uses p) */
6606 for (p = collstart, repsize = 0; p < collend; ++p) {
6607 if (*p<10)
6608 repsize += 2+1+1;
6609 else if (*p<100)
6610 repsize += 2+2+1;
6611 else if (*p<1000)
6612 repsize += 2+3+1;
6613 else if (*p<10000)
6614 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006615#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006616 else
6617 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006618#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006619 else if (*p<100000)
6620 repsize += 2+5+1;
6621 else if (*p<1000000)
6622 repsize += 2+6+1;
6623 else
6624 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00006625#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006626 }
6627 requiredsize = respos+repsize+(endp-collend);
6628 if (requiredsize > ressize) {
6629 if (requiredsize<2*ressize)
6630 requiredsize = 2*ressize;
6631 if (_PyBytes_Resize(&res, requiredsize))
6632 goto onError;
6633 str = PyBytes_AS_STRING(res) + respos;
6634 ressize = requiredsize;
6635 }
6636 /* generate replacement (temporarily (mis)uses p) */
6637 for (p = collstart; p < collend; ++p) {
6638 str += sprintf(str, "&#%d;", (int)*p);
6639 }
6640 p = collend;
6641 break;
6642 default:
6643 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6644 encoding, reason, startp, size, &exc,
6645 collstart-startp, collend-startp, &newpos);
6646 if (repunicode == NULL)
6647 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006648 if (PyBytes_Check(repunicode)) {
6649 /* Directly copy bytes result to output. */
6650 repsize = PyBytes_Size(repunicode);
6651 if (repsize > 1) {
6652 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006653 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006654 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6655 Py_DECREF(repunicode);
6656 goto onError;
6657 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006658 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006659 ressize += repsize-1;
6660 }
6661 memcpy(str, PyBytes_AsString(repunicode), repsize);
6662 str += repsize;
6663 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006664 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006665 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006666 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006667 /* need more space? (at least enough for what we
6668 have+the replacement+the rest of the string, so
6669 we won't have to check space for encodable characters) */
6670 respos = str - PyBytes_AS_STRING(res);
6671 repsize = PyUnicode_GET_SIZE(repunicode);
6672 requiredsize = respos+repsize+(endp-collend);
6673 if (requiredsize > ressize) {
6674 if (requiredsize<2*ressize)
6675 requiredsize = 2*ressize;
6676 if (_PyBytes_Resize(&res, requiredsize)) {
6677 Py_DECREF(repunicode);
6678 goto onError;
6679 }
6680 str = PyBytes_AS_STRING(res) + respos;
6681 ressize = requiredsize;
6682 }
6683 /* check if there is anything unencodable in the replacement
6684 and copy it to the output */
6685 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
6686 c = *uni2;
6687 if (c >= limit) {
6688 raise_encode_exception(&exc, encoding, startp, size,
6689 unicodepos, unicodepos+1, reason);
6690 Py_DECREF(repunicode);
6691 goto onError;
6692 }
6693 *str = (char)c;
6694 }
6695 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006696 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006697 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006698 }
6699 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006700 /* Resize if we allocated to much */
6701 size = str - PyBytes_AS_STRING(res);
6702 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006703 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006704 if (_PyBytes_Resize(&res, size) < 0)
6705 goto onError;
6706 }
6707
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006708 Py_XDECREF(errorHandler);
6709 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006710 return res;
6711
6712 onError:
6713 Py_XDECREF(res);
6714 Py_XDECREF(errorHandler);
6715 Py_XDECREF(exc);
6716 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006717}
6718
Alexander Belopolsky40018472011-02-26 01:02:56 +00006719PyObject *
6720PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006721 Py_ssize_t size,
6722 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006723{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006724 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006725}
6726
Alexander Belopolsky40018472011-02-26 01:02:56 +00006727PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006728_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006729{
6730 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006731 PyErr_BadArgument();
6732 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006733 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006734 if (PyUnicode_READY(unicode) == -1)
6735 return NULL;
6736 /* Fast path: if it is a one-byte string, construct
6737 bytes object directly. */
6738 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6739 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6740 PyUnicode_GET_LENGTH(unicode));
6741 /* Non-Latin-1 characters present. Defer to above function to
6742 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006743 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006744 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006745 errors);
6746}
6747
6748PyObject*
6749PyUnicode_AsLatin1String(PyObject *unicode)
6750{
6751 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006752}
6753
6754/* --- 7-bit ASCII Codec -------------------------------------------------- */
6755
Alexander Belopolsky40018472011-02-26 01:02:56 +00006756PyObject *
6757PyUnicode_DecodeASCII(const char *s,
6758 Py_ssize_t size,
6759 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006760{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006761 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006762 PyUnicodeObject *v;
Victor Stinner702c7342011-10-05 13:50:52 +02006763 Py_UNICODE *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006764 Py_ssize_t startinpos;
6765 Py_ssize_t endinpos;
6766 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006767 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006768 int has_error;
6769 const unsigned char *p = (const unsigned char *)s;
6770 const unsigned char *end = p + size;
6771 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006772 PyObject *errorHandler = NULL;
6773 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006774
Guido van Rossumd57fd912000-03-10 22:53:23 +00006775 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006776 if (size == 1 && (unsigned char)s[0] < 128)
6777 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006778
Victor Stinner702c7342011-10-05 13:50:52 +02006779 has_error = 0;
6780 while (p < end && !has_error) {
6781 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6782 an explanation. */
6783 if (!((size_t) p & LONG_PTR_MASK)) {
6784 /* Help register allocation */
6785 register const unsigned char *_p = p;
6786 while (_p < aligned_end) {
6787 unsigned long value = *(unsigned long *) _p;
6788 if (value & ASCII_CHAR_MASK) {
6789 has_error = 1;
6790 break;
6791 }
6792 _p += SIZEOF_LONG;
6793 }
6794 if (_p == end)
6795 break;
6796 if (has_error)
6797 break;
6798 p = _p;
6799 }
6800 if (*p & 0x80) {
6801 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006802 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006803 }
6804 else {
6805 ++p;
6806 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006807 }
Victor Stinner702c7342011-10-05 13:50:52 +02006808 if (!has_error)
6809 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006810
Guido van Rossumd57fd912000-03-10 22:53:23 +00006811 v = _PyUnicode_New(size);
6812 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006813 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006814 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006815 return (PyObject *)v;
Victor Stinner702c7342011-10-05 13:50:52 +02006816 u = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006817 e = s + size;
6818 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006819 register unsigned char c = (unsigned char)*s;
6820 if (c < 128) {
Victor Stinner702c7342011-10-05 13:50:52 +02006821 *u++ = c;
Benjamin Peterson29060642009-01-31 22:14:21 +00006822 ++s;
6823 }
6824 else {
6825 startinpos = s-starts;
6826 endinpos = startinpos + 1;
Victor Stinner702c7342011-10-05 13:50:52 +02006827 outpos = u - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006828 if (unicode_decode_call_errorhandler(
6829 errors, &errorHandler,
6830 "ascii", "ordinal not in range(128)",
6831 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinner702c7342011-10-05 13:50:52 +02006832 &v, &outpos, &u))
Benjamin Peterson29060642009-01-31 22:14:21 +00006833 goto onError;
6834 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006835 }
Victor Stinner702c7342011-10-05 13:50:52 +02006836 if (u - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
6837 if (PyUnicode_Resize((PyObject**)&v, u - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006838 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006839 Py_XDECREF(errorHandler);
6840 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006841#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006842 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006843 Py_DECREF(v);
6844 return NULL;
6845 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006846#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006847 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006848 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006849
Benjamin Peterson29060642009-01-31 22:14:21 +00006850 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006851 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006852 Py_XDECREF(errorHandler);
6853 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006854 return NULL;
6855}
6856
Alexander Belopolsky40018472011-02-26 01:02:56 +00006857PyObject *
6858PyUnicode_EncodeASCII(const Py_UNICODE *p,
6859 Py_ssize_t size,
6860 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006861{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006862 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006863}
6864
Alexander Belopolsky40018472011-02-26 01:02:56 +00006865PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006866_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006867{
6868 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006869 PyErr_BadArgument();
6870 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006871 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006872 if (PyUnicode_READY(unicode) == -1)
6873 return NULL;
6874 /* Fast path: if it is an ASCII-only string, construct bytes object
6875 directly. Else defer to above function to raise the exception. */
6876 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6877 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6878 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006879 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006880 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006881 errors);
6882}
6883
6884PyObject *
6885PyUnicode_AsASCIIString(PyObject *unicode)
6886{
6887 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006888}
6889
Victor Stinner99b95382011-07-04 14:23:54 +02006890#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006891
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006892/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006893
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006894#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006895#define NEED_RETRY
6896#endif
6897
6898/* XXX This code is limited to "true" double-byte encodings, as
6899 a) it assumes an incomplete character consists of a single byte, and
6900 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006901 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006902
/* Report whether the byte at s[offset] should be treated as a DBCS lead
   byte.  Returns 0 when IsDBCSLeadByte() rejects it.  Otherwise returns 1
   if CharPrev() finds no earlier character, if the byte CharPrev() points
   at is not itself a lead byte, or if the previous character is two bytes
   wide; returns 0 in the remaining case. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *target = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*target))
        return 0;

    before = CharPrev(s, target);
    if (before == target)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return target - before == 2;
}
6914
6915/*
6916 * Decode MBCS string into unicode object. If 'final' is set, converts
6917 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6918 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006919static int
6920decode_mbcs(PyUnicodeObject **v,
6921 const char *s, /* MBCS string */
6922 int size, /* sizeof MBCS string */
6923 int final,
6924 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006925{
6926 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006927 Py_ssize_t n;
6928 DWORD usize;
6929 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006930
6931 assert(size >= 0);
6932
Victor Stinner554f3f02010-06-16 23:33:54 +00006933 /* check and handle 'errors' arg */
6934 if (errors==NULL || strcmp(errors, "strict")==0)
6935 flags = MB_ERR_INVALID_CHARS;
6936 else if (strcmp(errors, "ignore")==0)
6937 flags = 0;
6938 else {
6939 PyErr_Format(PyExc_ValueError,
6940 "mbcs encoding does not support errors='%s'",
6941 errors);
6942 return -1;
6943 }
6944
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006945 /* Skip trailing lead-byte unless 'final' is set */
6946 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006947 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006948
6949 /* First get the size of the result */
6950 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006951 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6952 if (usize==0)
6953 goto mbcs_decode_error;
6954 } else
6955 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006956
6957 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006958 /* Create unicode object */
6959 *v = _PyUnicode_New(usize);
6960 if (*v == NULL)
6961 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006962 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006963 }
6964 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006965 /* Extend unicode object */
6966 n = PyUnicode_GET_SIZE(*v);
Victor Stinner2fd82272011-10-03 04:06:05 +02006967 if (PyUnicode_Resize((PyObject**)v, n + usize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006968 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006969 }
6970
6971 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006972 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006973 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006974 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6975 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006976 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006977 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006978 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006979
6980mbcs_decode_error:
6981 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6982 we raise a UnicodeDecodeError - else it is a 'generic'
6983 windows error
6984 */
6985 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6986 /* Ideally, we should get reason from FormatMessage - this
6987 is the Windows 2000 English version of the message
6988 */
6989 PyObject *exc = NULL;
6990 const char *reason = "No mapping for the Unicode character exists "
6991 "in the target multi-byte code page.";
6992 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6993 if (exc != NULL) {
6994 PyCodec_StrictErrors(exc);
6995 Py_DECREF(exc);
6996 }
6997 } else {
6998 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6999 }
7000 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007001}
7002
Alexander Belopolsky40018472011-02-26 01:02:56 +00007003PyObject *
7004PyUnicode_DecodeMBCSStateful(const char *s,
7005 Py_ssize_t size,
7006 const char *errors,
7007 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007008{
7009 PyUnicodeObject *v = NULL;
7010 int done;
7011
7012 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007013 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007014
7015#ifdef NEED_RETRY
7016 retry:
7017 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00007018 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007019 else
7020#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00007021 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007022
7023 if (done < 0) {
7024 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00007025 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007026 }
7027
7028 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007029 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007030
7031#ifdef NEED_RETRY
7032 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007033 s += done;
7034 size -= done;
7035 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007036 }
7037#endif
Victor Stinner17efeed2011-10-04 20:05:46 +02007038#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007039 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007040 Py_DECREF(v);
7041 return NULL;
7042 }
Victor Stinner17efeed2011-10-04 20:05:46 +02007043#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02007044 assert(_PyUnicode_CheckConsistency(v, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007045 return (PyObject *)v;
7046}
7047
Alexander Belopolsky40018472011-02-26 01:02:56 +00007048PyObject *
7049PyUnicode_DecodeMBCS(const char *s,
7050 Py_ssize_t size,
7051 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007052{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007053 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7054}
7055
7056/*
7057 * Convert unicode into string object (MBCS).
7058 * Returns 0 if succeed, -1 otherwise.
7059 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007060static int
7061encode_mbcs(PyObject **repr,
7062 const Py_UNICODE *p, /* unicode */
7063 int size, /* size of unicode */
7064 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007065{
Victor Stinner554f3f02010-06-16 23:33:54 +00007066 BOOL usedDefaultChar = FALSE;
7067 BOOL *pusedDefaultChar;
7068 int mbcssize;
7069 Py_ssize_t n;
7070 PyObject *exc = NULL;
7071 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007072
7073 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007074
Victor Stinner554f3f02010-06-16 23:33:54 +00007075 /* check and handle 'errors' arg */
7076 if (errors==NULL || strcmp(errors, "strict")==0) {
7077 flags = WC_NO_BEST_FIT_CHARS;
7078 pusedDefaultChar = &usedDefaultChar;
7079 } else if (strcmp(errors, "replace")==0) {
7080 flags = 0;
7081 pusedDefaultChar = NULL;
7082 } else {
7083 PyErr_Format(PyExc_ValueError,
7084 "mbcs encoding does not support errors='%s'",
7085 errors);
7086 return -1;
7087 }
7088
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007089 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007090 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00007091 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
7092 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00007093 if (mbcssize == 0) {
7094 PyErr_SetFromWindowsErrWithFilename(0, NULL);
7095 return -1;
7096 }
Victor Stinner554f3f02010-06-16 23:33:54 +00007097 /* If we used a default char, then we failed! */
7098 if (pusedDefaultChar && *pusedDefaultChar)
7099 goto mbcs_encode_error;
7100 } else {
7101 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007102 }
7103
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007104 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007105 /* Create string object */
7106 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
7107 if (*repr == NULL)
7108 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00007109 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007110 }
7111 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007112 /* Extend string object */
7113 n = PyBytes_Size(*repr);
7114 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
7115 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007116 }
7117
7118 /* Do the conversion */
7119 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007120 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00007121 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
7122 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007123 PyErr_SetFromWindowsErrWithFilename(0, NULL);
7124 return -1;
7125 }
Victor Stinner554f3f02010-06-16 23:33:54 +00007126 if (pusedDefaultChar && *pusedDefaultChar)
7127 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007128 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007129 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007130
7131mbcs_encode_error:
7132 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
7133 Py_XDECREF(exc);
7134 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007135}
7136
Alexander Belopolsky40018472011-02-26 01:02:56 +00007137PyObject *
7138PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7139 Py_ssize_t size,
7140 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007141{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007142 PyObject *repr = NULL;
7143 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00007144
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007145#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00007146 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007147 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00007148 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007149 else
7150#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00007151 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007152
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007153 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007154 Py_XDECREF(repr);
7155 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007156 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007157
7158#ifdef NEED_RETRY
7159 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007160 p += INT_MAX;
7161 size -= INT_MAX;
7162 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007163 }
7164#endif
7165
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007166 return repr;
7167}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007168
Alexander Belopolsky40018472011-02-26 01:02:56 +00007169PyObject *
7170PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007171{
7172 if (!PyUnicode_Check(unicode)) {
7173 PyErr_BadArgument();
7174 return NULL;
7175 }
7176 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007177 PyUnicode_GET_SIZE(unicode),
7178 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007179}
7180
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007181#undef NEED_RETRY
7182
Victor Stinner99b95382011-07-04 14:23:54 +02007183#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007184
Guido van Rossumd57fd912000-03-10 22:53:23 +00007185/* --- Character Mapping Codec -------------------------------------------- */
7186
Alexander Belopolsky40018472011-02-26 01:02:56 +00007187PyObject *
7188PyUnicode_DecodeCharmap(const char *s,
7189 Py_ssize_t size,
7190 PyObject *mapping,
7191 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007192{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007193 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007194 Py_ssize_t startinpos;
7195 Py_ssize_t endinpos;
7196 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007197 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007198 PyUnicodeObject *v;
7199 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007200 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007201 PyObject *errorHandler = NULL;
7202 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007203 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007204 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00007205
Guido van Rossumd57fd912000-03-10 22:53:23 +00007206 /* Default to Latin-1 */
7207 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007208 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007209
7210 v = _PyUnicode_New(size);
7211 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007212 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007213 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007214 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007215 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007216 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007217 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007218 mapstring = PyUnicode_AS_UNICODE(mapping);
7219 maplen = PyUnicode_GET_SIZE(mapping);
7220 while (s < e) {
7221 unsigned char ch = *s;
7222 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007223
Benjamin Peterson29060642009-01-31 22:14:21 +00007224 if (ch < maplen)
7225 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00007226
Benjamin Peterson29060642009-01-31 22:14:21 +00007227 if (x == 0xfffe) {
7228 /* undefined mapping */
7229 outpos = p-PyUnicode_AS_UNICODE(v);
7230 startinpos = s-starts;
7231 endinpos = startinpos+1;
7232 if (unicode_decode_call_errorhandler(
7233 errors, &errorHandler,
7234 "charmap", "character maps to <undefined>",
7235 &starts, &e, &startinpos, &endinpos, &exc, &s,
7236 &v, &outpos, &p)) {
7237 goto onError;
7238 }
7239 continue;
7240 }
7241 *p++ = x;
7242 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007243 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007244 }
7245 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007246 while (s < e) {
7247 unsigned char ch = *s;
7248 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007249
Benjamin Peterson29060642009-01-31 22:14:21 +00007250 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7251 w = PyLong_FromLong((long)ch);
7252 if (w == NULL)
7253 goto onError;
7254 x = PyObject_GetItem(mapping, w);
7255 Py_DECREF(w);
7256 if (x == NULL) {
7257 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7258 /* No mapping found means: mapping is undefined. */
7259 PyErr_Clear();
7260 x = Py_None;
7261 Py_INCREF(x);
7262 } else
7263 goto onError;
7264 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007265
Benjamin Peterson29060642009-01-31 22:14:21 +00007266 /* Apply mapping */
7267 if (PyLong_Check(x)) {
7268 long value = PyLong_AS_LONG(x);
7269 if (value < 0 || value > 65535) {
7270 PyErr_SetString(PyExc_TypeError,
7271 "character mapping must be in range(65536)");
7272 Py_DECREF(x);
7273 goto onError;
7274 }
7275 *p++ = (Py_UNICODE)value;
7276 }
7277 else if (x == Py_None) {
7278 /* undefined mapping */
7279 outpos = p-PyUnicode_AS_UNICODE(v);
7280 startinpos = s-starts;
7281 endinpos = startinpos+1;
7282 if (unicode_decode_call_errorhandler(
7283 errors, &errorHandler,
7284 "charmap", "character maps to <undefined>",
7285 &starts, &e, &startinpos, &endinpos, &exc, &s,
7286 &v, &outpos, &p)) {
7287 Py_DECREF(x);
7288 goto onError;
7289 }
7290 Py_DECREF(x);
7291 continue;
7292 }
7293 else if (PyUnicode_Check(x)) {
7294 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007295
Benjamin Peterson29060642009-01-31 22:14:21 +00007296 if (targetsize == 1)
7297 /* 1-1 mapping */
7298 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007299
Benjamin Peterson29060642009-01-31 22:14:21 +00007300 else if (targetsize > 1) {
7301 /* 1-n mapping */
7302 if (targetsize > extrachars) {
7303 /* resize first */
7304 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
7305 Py_ssize_t needed = (targetsize - extrachars) + \
7306 (targetsize << 2);
7307 extrachars += needed;
7308 /* XXX overflow detection missing */
Victor Stinnerfe226c02011-10-03 03:52:20 +02007309 if (PyUnicode_Resize((PyObject**)&v,
Benjamin Peterson29060642009-01-31 22:14:21 +00007310 PyUnicode_GET_SIZE(v) + needed) < 0) {
7311 Py_DECREF(x);
7312 goto onError;
7313 }
7314 p = PyUnicode_AS_UNICODE(v) + oldpos;
7315 }
7316 Py_UNICODE_COPY(p,
7317 PyUnicode_AS_UNICODE(x),
7318 targetsize);
7319 p += targetsize;
7320 extrachars -= targetsize;
7321 }
7322 /* 1-0 mapping: skip the character */
7323 }
7324 else {
7325 /* wrong return value */
7326 PyErr_SetString(PyExc_TypeError,
7327 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007328 Py_DECREF(x);
7329 goto onError;
7330 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007331 Py_DECREF(x);
7332 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007333 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007334 }
7335 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02007336 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007337 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007338 Py_XDECREF(errorHandler);
7339 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02007340#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007341 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007342 Py_DECREF(v);
7343 return NULL;
7344 }
Victor Stinner17efeed2011-10-04 20:05:46 +02007345#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02007346 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007347 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00007348
Benjamin Peterson29060642009-01-31 22:14:21 +00007349 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007350 Py_XDECREF(errorHandler);
7351 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007352 Py_XDECREF(v);
7353 return NULL;
7354}
7355
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007356/* Charmap encoding: the lookup table */
7357
Alexander Belopolsky40018472011-02-26 01:02:56 +00007358struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007359 PyObject_HEAD
7360 unsigned char level1[32];
7361 int count2, count3;
7362 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007363};
7364
7365static PyObject*
7366encoding_map_size(PyObject *obj, PyObject* args)
7367{
7368 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007369 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007370 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007371}
7372
7373static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007374 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007375 PyDoc_STR("Return the size (in bytes) of this object") },
7376 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007377};
7378
7379static void
7380encoding_map_dealloc(PyObject* o)
7381{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007382 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007383}
7384
7385static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007386 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007387 "EncodingMap", /*tp_name*/
7388 sizeof(struct encoding_map), /*tp_basicsize*/
7389 0, /*tp_itemsize*/
7390 /* methods */
7391 encoding_map_dealloc, /*tp_dealloc*/
7392 0, /*tp_print*/
7393 0, /*tp_getattr*/
7394 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007395 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007396 0, /*tp_repr*/
7397 0, /*tp_as_number*/
7398 0, /*tp_as_sequence*/
7399 0, /*tp_as_mapping*/
7400 0, /*tp_hash*/
7401 0, /*tp_call*/
7402 0, /*tp_str*/
7403 0, /*tp_getattro*/
7404 0, /*tp_setattro*/
7405 0, /*tp_as_buffer*/
7406 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7407 0, /*tp_doc*/
7408 0, /*tp_traverse*/
7409 0, /*tp_clear*/
7410 0, /*tp_richcompare*/
7411 0, /*tp_weaklistoffset*/
7412 0, /*tp_iter*/
7413 0, /*tp_iternext*/
7414 encoding_map_methods, /*tp_methods*/
7415 0, /*tp_members*/
7416 0, /*tp_getset*/
7417 0, /*tp_base*/
7418 0, /*tp_dict*/
7419 0, /*tp_descr_get*/
7420 0, /*tp_descr_set*/
7421 0, /*tp_dictoffset*/
7422 0, /*tp_init*/
7423 0, /*tp_alloc*/
7424 0, /*tp_new*/
7425 0, /*tp_free*/
7426 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007427};
7428
7429PyObject*
7430PyUnicode_BuildEncodingMap(PyObject* string)
7431{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007432 PyObject *result;
7433 struct encoding_map *mresult;
7434 int i;
7435 int need_dict = 0;
7436 unsigned char level1[32];
7437 unsigned char level2[512];
7438 unsigned char *mlevel1, *mlevel2, *mlevel3;
7439 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007440 int kind;
7441 void *data;
7442 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007443
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007444 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007445 PyErr_BadArgument();
7446 return NULL;
7447 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007448 kind = PyUnicode_KIND(string);
7449 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007450 memset(level1, 0xFF, sizeof level1);
7451 memset(level2, 0xFF, sizeof level2);
7452
7453 /* If there isn't a one-to-one mapping of NULL to \0,
7454 or if there are non-BMP characters, we need to use
7455 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007456 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007457 need_dict = 1;
7458 for (i = 1; i < 256; i++) {
7459 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007460 ch = PyUnicode_READ(kind, data, i);
7461 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007462 need_dict = 1;
7463 break;
7464 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007465 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007466 /* unmapped character */
7467 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007468 l1 = ch >> 11;
7469 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007470 if (level1[l1] == 0xFF)
7471 level1[l1] = count2++;
7472 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007473 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007474 }
7475
7476 if (count2 >= 0xFF || count3 >= 0xFF)
7477 need_dict = 1;
7478
7479 if (need_dict) {
7480 PyObject *result = PyDict_New();
7481 PyObject *key, *value;
7482 if (!result)
7483 return NULL;
7484 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007485 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007486 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007487 if (!key || !value)
7488 goto failed1;
7489 if (PyDict_SetItem(result, key, value) == -1)
7490 goto failed1;
7491 Py_DECREF(key);
7492 Py_DECREF(value);
7493 }
7494 return result;
7495 failed1:
7496 Py_XDECREF(key);
7497 Py_XDECREF(value);
7498 Py_DECREF(result);
7499 return NULL;
7500 }
7501
7502 /* Create a three-level trie */
7503 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7504 16*count2 + 128*count3 - 1);
7505 if (!result)
7506 return PyErr_NoMemory();
7507 PyObject_Init(result, &EncodingMapType);
7508 mresult = (struct encoding_map*)result;
7509 mresult->count2 = count2;
7510 mresult->count3 = count3;
7511 mlevel1 = mresult->level1;
7512 mlevel2 = mresult->level23;
7513 mlevel3 = mresult->level23 + 16*count2;
7514 memcpy(mlevel1, level1, 32);
7515 memset(mlevel2, 0xFF, 16*count2);
7516 memset(mlevel3, 0, 128*count3);
7517 count3 = 0;
7518 for (i = 1; i < 256; i++) {
7519 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007520 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007521 /* unmapped character */
7522 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007523 o1 = PyUnicode_READ(kind, data, i)>>11;
7524 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007525 i2 = 16*mlevel1[o1] + o2;
7526 if (mlevel2[i2] == 0xFF)
7527 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007528 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007529 i3 = 128*mlevel2[i2] + o3;
7530 mlevel3[i3] = i;
7531 }
7532 return result;
7533}
7534
7535static int
7536encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
7537{
7538 struct encoding_map *map = (struct encoding_map*)mapping;
7539 int l1 = c>>11;
7540 int l2 = (c>>7) & 0xF;
7541 int l3 = c & 0x7F;
7542 int i;
7543
7544#ifdef Py_UNICODE_WIDE
7545 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007546 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007547 }
7548#endif
7549 if (c == 0)
7550 return 0;
7551 /* level 1*/
7552 i = map->level1[l1];
7553 if (i == 0xFF) {
7554 return -1;
7555 }
7556 /* level 2*/
7557 i = map->level23[16*i+l2];
7558 if (i == 0xFF) {
7559 return -1;
7560 }
7561 /* level 3 */
7562 i = map->level23[16*map->count2 + 128*i + l3];
7563 if (i == 0) {
7564 return -1;
7565 }
7566 return i;
7567}
7568
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007569/* Lookup the character ch in the mapping. If the character
7570 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007571 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007572static PyObject *
7573charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007574{
Christian Heimes217cfd12007-12-02 14:31:20 +00007575 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007576 PyObject *x;
7577
7578 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007579 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007580 x = PyObject_GetItem(mapping, w);
7581 Py_DECREF(w);
7582 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007583 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7584 /* No mapping found means: mapping is undefined. */
7585 PyErr_Clear();
7586 x = Py_None;
7587 Py_INCREF(x);
7588 return x;
7589 } else
7590 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007591 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007592 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007593 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007594 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007595 long value = PyLong_AS_LONG(x);
7596 if (value < 0 || value > 255) {
7597 PyErr_SetString(PyExc_TypeError,
7598 "character mapping must be in range(256)");
7599 Py_DECREF(x);
7600 return NULL;
7601 }
7602 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007603 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007604 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007605 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007606 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007607 /* wrong return value */
7608 PyErr_Format(PyExc_TypeError,
7609 "character mapping must return integer, bytes or None, not %.400s",
7610 x->ob_type->tp_name);
7611 Py_DECREF(x);
7612 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007613 }
7614}
7615
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007616static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007617charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007618{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007619 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7620 /* exponentially overallocate to minimize reallocations */
7621 if (requiredsize < 2*outsize)
7622 requiredsize = 2*outsize;
7623 if (_PyBytes_Resize(outobj, requiredsize))
7624 return -1;
7625 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007626}
7627
/* Outcome of encoding a single character (see charmapencode_output). */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* character was written to the output */
    enc_FAILED,         /* mapping is undefined for the character */
    enc_EXCEPTION       /* an error occurred during lookup or resize */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007631/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007632 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007633 space is available. Return a new reference to the object that
7634 was put in the output buffer, or Py_None, if the mapping was undefined
7635 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007636 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007637static charmapencode_result
7638charmapencode_output(Py_UNICODE c, PyObject *mapping,
7639 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007640{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007641 PyObject *rep;
7642 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007643 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007644
Christian Heimes90aa7642007-12-19 02:45:37 +00007645 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007646 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007647 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007648 if (res == -1)
7649 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007650 if (outsize<requiredsize)
7651 if (charmapencode_resize(outobj, outpos, requiredsize))
7652 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007653 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007654 outstart[(*outpos)++] = (char)res;
7655 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007656 }
7657
7658 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007659 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007660 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007661 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007662 Py_DECREF(rep);
7663 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007664 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007665 if (PyLong_Check(rep)) {
7666 Py_ssize_t requiredsize = *outpos+1;
7667 if (outsize<requiredsize)
7668 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7669 Py_DECREF(rep);
7670 return enc_EXCEPTION;
7671 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007672 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007673 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007674 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007675 else {
7676 const char *repchars = PyBytes_AS_STRING(rep);
7677 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7678 Py_ssize_t requiredsize = *outpos+repsize;
7679 if (outsize<requiredsize)
7680 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7681 Py_DECREF(rep);
7682 return enc_EXCEPTION;
7683 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007684 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007685 memcpy(outstart + *outpos, repchars, repsize);
7686 *outpos += repsize;
7687 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007688 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007689 Py_DECREF(rep);
7690 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007691}
7692
7693/* handle an error in PyUnicode_EncodeCharmap
7694 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007695static int
7696charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00007697 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007698 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007699 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007700 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007701{
7702 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007703 Py_ssize_t repsize;
7704 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007705 Py_UNICODE *uni2;
7706 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007707 Py_ssize_t collstartpos = *inpos;
7708 Py_ssize_t collendpos = *inpos+1;
7709 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007710 char *encoding = "charmap";
7711 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007712 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007713
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007714 /* find all unencodable characters */
7715 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007716 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007717 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007718 int res = encoding_map_lookup(p[collendpos], mapping);
7719 if (res != -1)
7720 break;
7721 ++collendpos;
7722 continue;
7723 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007724
Benjamin Peterson29060642009-01-31 22:14:21 +00007725 rep = charmapencode_lookup(p[collendpos], mapping);
7726 if (rep==NULL)
7727 return -1;
7728 else if (rep!=Py_None) {
7729 Py_DECREF(rep);
7730 break;
7731 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007732 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007733 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007734 }
7735 /* cache callback name lookup
7736 * (if not done yet, i.e. it's the first error) */
7737 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007738 if ((errors==NULL) || (!strcmp(errors, "strict")))
7739 *known_errorHandler = 1;
7740 else if (!strcmp(errors, "replace"))
7741 *known_errorHandler = 2;
7742 else if (!strcmp(errors, "ignore"))
7743 *known_errorHandler = 3;
7744 else if (!strcmp(errors, "xmlcharrefreplace"))
7745 *known_errorHandler = 4;
7746 else
7747 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007748 }
7749 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007750 case 1: /* strict */
7751 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7752 return -1;
7753 case 2: /* replace */
7754 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007755 x = charmapencode_output('?', mapping, res, respos);
7756 if (x==enc_EXCEPTION) {
7757 return -1;
7758 }
7759 else if (x==enc_FAILED) {
7760 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7761 return -1;
7762 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007763 }
7764 /* fall through */
7765 case 3: /* ignore */
7766 *inpos = collendpos;
7767 break;
7768 case 4: /* xmlcharrefreplace */
7769 /* generate replacement (temporarily (mis)uses p) */
7770 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007771 char buffer[2+29+1+1];
7772 char *cp;
7773 sprintf(buffer, "&#%d;", (int)p[collpos]);
7774 for (cp = buffer; *cp; ++cp) {
7775 x = charmapencode_output(*cp, mapping, res, respos);
7776 if (x==enc_EXCEPTION)
7777 return -1;
7778 else if (x==enc_FAILED) {
7779 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7780 return -1;
7781 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007782 }
7783 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007784 *inpos = collendpos;
7785 break;
7786 default:
7787 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00007788 encoding, reason, p, size, exceptionObject,
7789 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007790 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007791 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007792 if (PyBytes_Check(repunicode)) {
7793 /* Directly copy bytes result to output. */
7794 Py_ssize_t outsize = PyBytes_Size(*res);
7795 Py_ssize_t requiredsize;
7796 repsize = PyBytes_Size(repunicode);
7797 requiredsize = *respos + repsize;
7798 if (requiredsize > outsize)
7799 /* Make room for all additional bytes. */
7800 if (charmapencode_resize(res, respos, requiredsize)) {
7801 Py_DECREF(repunicode);
7802 return -1;
7803 }
7804 memcpy(PyBytes_AsString(*res) + *respos,
7805 PyBytes_AsString(repunicode), repsize);
7806 *respos += repsize;
7807 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007808 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007809 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007810 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007811 /* generate replacement */
7812 repsize = PyUnicode_GET_SIZE(repunicode);
7813 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007814 x = charmapencode_output(*uni2, mapping, res, respos);
7815 if (x==enc_EXCEPTION) {
7816 return -1;
7817 }
7818 else if (x==enc_FAILED) {
7819 Py_DECREF(repunicode);
7820 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7821 return -1;
7822 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007823 }
7824 *inpos = newpos;
7825 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007826 }
7827 return 0;
7828}
7829
Alexander Belopolsky40018472011-02-26 01:02:56 +00007830PyObject *
7831PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7832 Py_ssize_t size,
7833 PyObject *mapping,
7834 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007835{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007836 /* output object */
7837 PyObject *res = NULL;
7838 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007839 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007840 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007841 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007842 PyObject *errorHandler = NULL;
7843 PyObject *exc = NULL;
7844 /* the following variable is used for caching string comparisons
7845 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7846 * 3=ignore, 4=xmlcharrefreplace */
7847 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007848
7849 /* Default to Latin-1 */
7850 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007851 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007852
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007853 /* allocate enough for a simple encoding without
7854 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007855 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007856 if (res == NULL)
7857 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007858 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007859 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007860
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007861 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007862 /* try to encode it */
7863 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7864 if (x==enc_EXCEPTION) /* error */
7865 goto onError;
7866 if (x==enc_FAILED) { /* unencodable character */
7867 if (charmap_encoding_error(p, size, &inpos, mapping,
7868 &exc,
7869 &known_errorHandler, &errorHandler, errors,
7870 &res, &respos)) {
7871 goto onError;
7872 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007873 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007874 else
7875 /* done with this character => adjust input position */
7876 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007877 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007878
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007879 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007880 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007881 if (_PyBytes_Resize(&res, respos) < 0)
7882 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007883
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007884 Py_XDECREF(exc);
7885 Py_XDECREF(errorHandler);
7886 return res;
7887
Benjamin Peterson29060642009-01-31 22:14:21 +00007888 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007889 Py_XDECREF(res);
7890 Py_XDECREF(exc);
7891 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007892 return NULL;
7893}
7894
Alexander Belopolsky40018472011-02-26 01:02:56 +00007895PyObject *
7896PyUnicode_AsCharmapString(PyObject *unicode,
7897 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007898{
7899 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007900 PyErr_BadArgument();
7901 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007902 }
7903 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007904 PyUnicode_GET_SIZE(unicode),
7905 mapping,
7906 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007907}
7908
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007909/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007910static void
7911make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007912 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007913 Py_ssize_t startpos, Py_ssize_t endpos,
7914 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007915{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007916 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007917 *exceptionObject = _PyUnicodeTranslateError_Create(
7918 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007919 }
7920 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007921 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7922 goto onError;
7923 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7924 goto onError;
7925 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7926 goto onError;
7927 return;
7928 onError:
7929 Py_DECREF(*exceptionObject);
7930 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007931 }
7932}
7933
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007934/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007935static void
7936raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007937 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007938 Py_ssize_t startpos, Py_ssize_t endpos,
7939 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007940{
7941 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007942 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007943 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007944 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007945}
7946
7947/* error handling callback helper:
7948 build arguments, call the callback and check the arguments,
7949 put the result into newpos and return the replacement string, which
7950 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007951static PyObject *
7952unicode_translate_call_errorhandler(const char *errors,
7953 PyObject **errorHandler,
7954 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007955 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007956 Py_ssize_t startpos, Py_ssize_t endpos,
7957 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007958{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007959 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007960
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007961 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007962 PyObject *restuple;
7963 PyObject *resunicode;
7964
7965 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007966 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007967 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007968 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007969 }
7970
7971 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007972 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007973 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007974 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007975
7976 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007977 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007978 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007979 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007980 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007981 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007982 Py_DECREF(restuple);
7983 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007984 }
7985 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007986 &resunicode, &i_newpos)) {
7987 Py_DECREF(restuple);
7988 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007989 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007990 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007991 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007992 else
7993 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007994 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007995 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7996 Py_DECREF(restuple);
7997 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007998 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007999 Py_INCREF(resunicode);
8000 Py_DECREF(restuple);
8001 return resunicode;
8002}
8003
8004/* Lookup the character ch in the mapping and put the result in result,
8005 which must be decrefed by the caller.
8006 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008007static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008008charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008009{
Christian Heimes217cfd12007-12-02 14:31:20 +00008010 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008011 PyObject *x;
8012
8013 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008014 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008015 x = PyObject_GetItem(mapping, w);
8016 Py_DECREF(w);
8017 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008018 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8019 /* No mapping found means: use 1:1 mapping. */
8020 PyErr_Clear();
8021 *result = NULL;
8022 return 0;
8023 } else
8024 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008025 }
8026 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008027 *result = x;
8028 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008029 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008030 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008031 long value = PyLong_AS_LONG(x);
8032 long max = PyUnicode_GetMax();
8033 if (value < 0 || value > max) {
8034 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008035 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008036 Py_DECREF(x);
8037 return -1;
8038 }
8039 *result = x;
8040 return 0;
8041 }
8042 else if (PyUnicode_Check(x)) {
8043 *result = x;
8044 return 0;
8045 }
8046 else {
8047 /* wrong return value */
8048 PyErr_SetString(PyExc_TypeError,
8049 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008050 Py_DECREF(x);
8051 return -1;
8052 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008053}
8054/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008055 if not reallocate and adjust various state variables.
8056 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008057static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008058charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008059 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008060{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008061 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00008062 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008063 /* exponentially overallocate to minimize reallocations */
8064 if (requiredsize < 2 * oldsize)
8065 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008066 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8067 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008068 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008069 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008070 }
8071 return 0;
8072}
8073/* lookup the character, put the result in the output string and adjust
8074 various state variables. Return a new reference to the object that
8075 was put in the output buffer in *result, or Py_None, if the mapping was
8076 undefined (in which case no character was written).
8077 The called must decref result.
8078 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008079static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008080charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8081 PyObject *mapping, Py_UCS4 **output,
8082 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008083 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008084{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008085 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8086 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008087 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008088 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008089 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008090 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008091 }
8092 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008093 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008094 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008095 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008096 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008097 }
8098 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008099 Py_ssize_t repsize;
8100 if (PyUnicode_READY(*res) == -1)
8101 return -1;
8102 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008103 if (repsize==1) {
8104 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008105 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008106 }
8107 else if (repsize!=0) {
8108 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008109 Py_ssize_t requiredsize = *opos +
8110 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008111 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008112 Py_ssize_t i;
8113 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008114 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008115 for(i = 0; i < repsize; i++)
8116 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008117 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008118 }
8119 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008120 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008121 return 0;
8122}
8123
Alexander Belopolsky40018472011-02-26 01:02:56 +00008124PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008125_PyUnicode_TranslateCharmap(PyObject *input,
8126 PyObject *mapping,
8127 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008128{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008129 /* input object */
8130 char *idata;
8131 Py_ssize_t size, i;
8132 int kind;
8133 /* output buffer */
8134 Py_UCS4 *output = NULL;
8135 Py_ssize_t osize;
8136 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008137 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008138 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008139 char *reason = "character maps to <undefined>";
8140 PyObject *errorHandler = NULL;
8141 PyObject *exc = NULL;
8142 /* the following variable is used for caching string comparisons
8143 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8144 * 3=ignore, 4=xmlcharrefreplace */
8145 int known_errorHandler = -1;
8146
Guido van Rossumd57fd912000-03-10 22:53:23 +00008147 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008148 PyErr_BadArgument();
8149 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008150 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008151
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008152 if (PyUnicode_READY(input) == -1)
8153 return NULL;
8154 idata = (char*)PyUnicode_DATA(input);
8155 kind = PyUnicode_KIND(input);
8156 size = PyUnicode_GET_LENGTH(input);
8157 i = 0;
8158
8159 if (size == 0) {
8160 Py_INCREF(input);
8161 return input;
8162 }
8163
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008164 /* allocate enough for a simple 1:1 translation without
8165 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008166 osize = size;
8167 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8168 opos = 0;
8169 if (output == NULL) {
8170 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008171 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008172 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008173
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008174 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008175 /* try to encode it */
8176 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008177 if (charmaptranslate_output(input, i, mapping,
8178 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008179 Py_XDECREF(x);
8180 goto onError;
8181 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008182 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008183 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008184 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008185 else { /* untranslatable character */
8186 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8187 Py_ssize_t repsize;
8188 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008189 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008190 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008191 Py_ssize_t collstart = i;
8192 Py_ssize_t collend = i+1;
8193 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008194
Benjamin Peterson29060642009-01-31 22:14:21 +00008195 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008196 while (collend < size) {
8197 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008198 goto onError;
8199 Py_XDECREF(x);
8200 if (x!=Py_None)
8201 break;
8202 ++collend;
8203 }
8204 /* cache callback name lookup
8205 * (if not done yet, i.e. it's the first error) */
8206 if (known_errorHandler==-1) {
8207 if ((errors==NULL) || (!strcmp(errors, "strict")))
8208 known_errorHandler = 1;
8209 else if (!strcmp(errors, "replace"))
8210 known_errorHandler = 2;
8211 else if (!strcmp(errors, "ignore"))
8212 known_errorHandler = 3;
8213 else if (!strcmp(errors, "xmlcharrefreplace"))
8214 known_errorHandler = 4;
8215 else
8216 known_errorHandler = 0;
8217 }
8218 switch (known_errorHandler) {
8219 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008220 raise_translate_exception(&exc, input, collstart,
8221 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008222 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008223 case 2: /* replace */
8224 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008225 for (coll = collstart; coll<collend; coll++)
8226 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008227 /* fall through */
8228 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008229 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008230 break;
8231 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008232 /* generate replacement (temporarily (mis)uses i) */
8233 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008234 char buffer[2+29+1+1];
8235 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008236 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8237 if (charmaptranslate_makespace(&output, &osize,
8238 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008239 goto onError;
8240 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008241 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008242 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008243 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008244 break;
8245 default:
8246 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008247 reason, input, &exc,
8248 collstart, collend, &newpos);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02008249 if (repunicode == NULL || _PyUnicode_READY_REPLACE(&repunicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00008250 goto onError;
8251 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008252 repsize = PyUnicode_GET_LENGTH(repunicode);
8253 if (charmaptranslate_makespace(&output, &osize,
8254 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008255 Py_DECREF(repunicode);
8256 goto onError;
8257 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008258 for (uni2 = 0; repsize-->0; ++uni2)
8259 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8260 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008261 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008262 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008263 }
8264 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008265 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8266 if (!res)
8267 goto onError;
8268 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008269 Py_XDECREF(exc);
8270 Py_XDECREF(errorHandler);
8271 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008272
Benjamin Peterson29060642009-01-31 22:14:21 +00008273 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008274 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008275 Py_XDECREF(exc);
8276 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008277 return NULL;
8278}
8279
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008280/* Deprecated. Use PyUnicode_Translate instead. */
8281PyObject *
8282PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8283 Py_ssize_t size,
8284 PyObject *mapping,
8285 const char *errors)
8286{
8287 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8288 if (!unicode)
8289 return NULL;
8290 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8291}
8292
Alexander Belopolsky40018472011-02-26 01:02:56 +00008293PyObject *
8294PyUnicode_Translate(PyObject *str,
8295 PyObject *mapping,
8296 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008297{
8298 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008299
Guido van Rossumd57fd912000-03-10 22:53:23 +00008300 str = PyUnicode_FromObject(str);
8301 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008302 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008303 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008304 Py_DECREF(str);
8305 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008306
Benjamin Peterson29060642009-01-31 22:14:21 +00008307 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008308 Py_XDECREF(str);
8309 return NULL;
8310}
Tim Petersced69f82003-09-16 20:30:58 +00008311
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008312static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008313fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008314{
8315 /* No need to call PyUnicode_READY(self) because this function is only
8316 called as a callback from fixup() which does it already. */
8317 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8318 const int kind = PyUnicode_KIND(self);
8319 void *data = PyUnicode_DATA(self);
8320 Py_UCS4 maxchar = 0, ch, fixed;
8321 Py_ssize_t i;
8322
8323 for (i = 0; i < len; ++i) {
8324 ch = PyUnicode_READ(kind, data, i);
8325 fixed = 0;
8326 if (ch > 127) {
8327 if (Py_UNICODE_ISSPACE(ch))
8328 fixed = ' ';
8329 else {
8330 const int decimal = Py_UNICODE_TODECIMAL(ch);
8331 if (decimal >= 0)
8332 fixed = '0' + decimal;
8333 }
8334 if (fixed != 0) {
8335 if (fixed > maxchar)
8336 maxchar = fixed;
8337 PyUnicode_WRITE(kind, data, i, fixed);
8338 }
8339 else if (ch > maxchar)
8340 maxchar = ch;
8341 }
8342 else if (ch > maxchar)
8343 maxchar = ch;
8344 }
8345
8346 return maxchar;
8347}
8348
8349PyObject *
8350_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8351{
8352 if (!PyUnicode_Check(unicode)) {
8353 PyErr_BadInternalCall();
8354 return NULL;
8355 }
8356 if (PyUnicode_READY(unicode) == -1)
8357 return NULL;
8358 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8359 /* If the string is already ASCII, just return the same string */
8360 Py_INCREF(unicode);
8361 return unicode;
8362 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008363 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008364}
8365
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008366PyObject *
8367PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8368 Py_ssize_t length)
8369{
8370 PyObject *result;
8371 Py_UNICODE *p; /* write pointer into result */
8372 Py_ssize_t i;
8373 /* Copy to a new string */
8374 result = (PyObject *)_PyUnicode_New(length);
8375 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
8376 if (result == NULL)
8377 return result;
8378 p = PyUnicode_AS_UNICODE(result);
8379 /* Iterate over code points */
8380 for (i = 0; i < length; i++) {
8381 Py_UNICODE ch =s[i];
8382 if (ch > 127) {
8383 int decimal = Py_UNICODE_TODECIMAL(ch);
8384 if (decimal >= 0)
8385 p[i] = '0' + decimal;
8386 }
8387 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008388#ifndef DONT_MAKE_RESULT_READY
8389 if (_PyUnicode_READY_REPLACE(&result)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008390 Py_DECREF(result);
8391 return NULL;
8392 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008393#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02008394 assert(_PyUnicode_CheckConsistency(result, 1));
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008395 return result;
8396}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008397/* --- Decimal Encoder ---------------------------------------------------- */
8398
Alexander Belopolsky40018472011-02-26 01:02:56 +00008399int
8400PyUnicode_EncodeDecimal(Py_UNICODE *s,
8401 Py_ssize_t length,
8402 char *output,
8403 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008404{
8405 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008406 PyObject *errorHandler = NULL;
8407 PyObject *exc = NULL;
8408 const char *encoding = "decimal";
8409 const char *reason = "invalid decimal Unicode string";
8410 /* the following variable is used for caching string comparisons
8411 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
8412 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008413
8414 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008415 PyErr_BadArgument();
8416 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008417 }
8418
8419 p = s;
8420 end = s + length;
8421 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008422 register Py_UNICODE ch = *p;
8423 int decimal;
8424 PyObject *repunicode;
8425 Py_ssize_t repsize;
8426 Py_ssize_t newpos;
8427 Py_UNICODE *uni2;
8428 Py_UNICODE *collstart;
8429 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00008430
Benjamin Peterson29060642009-01-31 22:14:21 +00008431 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008432 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00008433 ++p;
8434 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008435 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008436 decimal = Py_UNICODE_TODECIMAL(ch);
8437 if (decimal >= 0) {
8438 *output++ = '0' + decimal;
8439 ++p;
8440 continue;
8441 }
8442 if (0 < ch && ch < 256) {
8443 *output++ = (char)ch;
8444 ++p;
8445 continue;
8446 }
8447 /* All other characters are considered unencodable */
8448 collstart = p;
8449 collend = p+1;
8450 while (collend < end) {
8451 if ((0 < *collend && *collend < 256) ||
8452 !Py_UNICODE_ISSPACE(*collend) ||
8453 Py_UNICODE_TODECIMAL(*collend))
8454 break;
8455 }
8456 /* cache callback name lookup
8457 * (if not done yet, i.e. it's the first error) */
8458 if (known_errorHandler==-1) {
8459 if ((errors==NULL) || (!strcmp(errors, "strict")))
8460 known_errorHandler = 1;
8461 else if (!strcmp(errors, "replace"))
8462 known_errorHandler = 2;
8463 else if (!strcmp(errors, "ignore"))
8464 known_errorHandler = 3;
8465 else if (!strcmp(errors, "xmlcharrefreplace"))
8466 known_errorHandler = 4;
8467 else
8468 known_errorHandler = 0;
8469 }
8470 switch (known_errorHandler) {
8471 case 1: /* strict */
8472 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
8473 goto onError;
8474 case 2: /* replace */
8475 for (p = collstart; p < collend; ++p)
8476 *output++ = '?';
8477 /* fall through */
8478 case 3: /* ignore */
8479 p = collend;
8480 break;
8481 case 4: /* xmlcharrefreplace */
8482 /* generate replacement (temporarily (mis)uses p) */
8483 for (p = collstart; p < collend; ++p)
8484 output += sprintf(output, "&#%d;", (int)*p);
8485 p = collend;
8486 break;
8487 default:
8488 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
8489 encoding, reason, s, length, &exc,
8490 collstart-s, collend-s, &newpos);
8491 if (repunicode == NULL)
8492 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008493 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00008494 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008495 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8496 Py_DECREF(repunicode);
8497 goto onError;
8498 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008499 /* generate replacement */
8500 repsize = PyUnicode_GET_SIZE(repunicode);
8501 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8502 Py_UNICODE ch = *uni2;
8503 if (Py_UNICODE_ISSPACE(ch))
8504 *output++ = ' ';
8505 else {
8506 decimal = Py_UNICODE_TODECIMAL(ch);
8507 if (decimal >= 0)
8508 *output++ = '0' + decimal;
8509 else if (0 < ch && ch < 256)
8510 *output++ = (char)ch;
8511 else {
8512 Py_DECREF(repunicode);
8513 raise_encode_exception(&exc, encoding,
8514 s, length, collstart-s, collend-s, reason);
8515 goto onError;
8516 }
8517 }
8518 }
8519 p = s + newpos;
8520 Py_DECREF(repunicode);
8521 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00008522 }
8523 /* 0-terminate the output string */
8524 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008525 Py_XDECREF(exc);
8526 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008527 return 0;
8528
Benjamin Peterson29060642009-01-31 22:14:21 +00008529 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008530 Py_XDECREF(exc);
8531 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008532 return -1;
8533}
8534
Guido van Rossumd57fd912000-03-10 22:53:23 +00008535/* --- Helpers ------------------------------------------------------------ */
8536
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008537static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008538any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008539 Py_ssize_t start,
8540 Py_ssize_t end)
8541{
8542 int kind1, kind2, kind;
8543 void *buf1, *buf2;
8544 Py_ssize_t len1, len2, result;
8545
8546 kind1 = PyUnicode_KIND(s1);
8547 kind2 = PyUnicode_KIND(s2);
8548 kind = kind1 > kind2 ? kind1 : kind2;
8549 buf1 = PyUnicode_DATA(s1);
8550 buf2 = PyUnicode_DATA(s2);
8551 if (kind1 != kind)
8552 buf1 = _PyUnicode_AsKind(s1, kind);
8553 if (!buf1)
8554 return -2;
8555 if (kind2 != kind)
8556 buf2 = _PyUnicode_AsKind(s2, kind);
8557 if (!buf2) {
8558 if (kind1 != kind) PyMem_Free(buf1);
8559 return -2;
8560 }
8561 len1 = PyUnicode_GET_LENGTH(s1);
8562 len2 = PyUnicode_GET_LENGTH(s2);
8563
Victor Stinner794d5672011-10-10 03:21:36 +02008564 if (direction > 0) {
8565 switch(kind) {
8566 case PyUnicode_1BYTE_KIND:
8567 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8568 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8569 else
8570 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8571 break;
8572 case PyUnicode_2BYTE_KIND:
8573 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8574 break;
8575 case PyUnicode_4BYTE_KIND:
8576 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8577 break;
8578 default:
8579 assert(0); result = -2;
8580 }
8581 }
8582 else {
8583 switch(kind) {
8584 case PyUnicode_1BYTE_KIND:
8585 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8586 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8587 else
8588 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8589 break;
8590 case PyUnicode_2BYTE_KIND:
8591 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8592 break;
8593 case PyUnicode_4BYTE_KIND:
8594 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8595 break;
8596 default:
8597 assert(0); result = -2;
8598 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008599 }
8600
8601 if (kind1 != kind)
8602 PyMem_Free(buf1);
8603 if (kind2 != kind)
8604 PyMem_Free(buf2);
8605
8606 return result;
8607}
8608
8609Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02008610_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008611 Py_ssize_t n_buffer,
8612 void *digits, Py_ssize_t n_digits,
8613 Py_ssize_t min_width,
8614 const char *grouping,
8615 const char *thousands_sep)
8616{
8617 switch(kind) {
8618 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008619 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
8620 return _PyUnicode_ascii_InsertThousandsGrouping(
8621 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8622 min_width, grouping, thousands_sep);
8623 else
8624 return _PyUnicode_ucs1_InsertThousandsGrouping(
8625 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8626 min_width, grouping, thousands_sep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008627 case PyUnicode_2BYTE_KIND:
8628 return _PyUnicode_ucs2_InsertThousandsGrouping(
8629 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8630 min_width, grouping, thousands_sep);
8631 case PyUnicode_4BYTE_KIND:
8632 return _PyUnicode_ucs4_InsertThousandsGrouping(
8633 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8634 min_width, grouping, thousands_sep);
8635 }
8636 assert(0);
8637 return -1;
8638}
8639
8640
Eric Smith8c663262007-08-25 02:26:07 +00008641#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00008642#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008643
Thomas Wouters477c8d52006-05-27 19:21:47 +00008644#include "stringlib/count.h"
8645#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00008646
/* Helper macro to normalise start/end slice values against a length.
   end:   values above len become len; negative values are counted from
          the end and floored at 0.
   start: negative values are counted from the end and floored at 0;
          values above len are left as they are.
   start and end must be modifiable lvalues.  The expansion is a pair of
   bare if statements, not a single statement. */
#define ADJUST_INDICES(start, end, len)                         \
    if (end > len)                                              \
        end = len;                                              \
    else if (end < 0) {                                         \
        end += len;                                             \
        if (end < 0)                                            \
            end = 0;                                            \
    }                                                           \
    if (start < 0) {                                            \
        start += len;                                           \
        if (start < 0)                                          \
            start = 0;                                          \
    }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008661
Alexander Belopolsky40018472011-02-26 01:02:56 +00008662Py_ssize_t
8663PyUnicode_Count(PyObject *str,
8664 PyObject *substr,
8665 Py_ssize_t start,
8666 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008667{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008668 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008669 PyUnicodeObject* str_obj;
8670 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008671 int kind1, kind2, kind;
8672 void *buf1 = NULL, *buf2 = NULL;
8673 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008674
Thomas Wouters477c8d52006-05-27 19:21:47 +00008675 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008676 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008677 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008678 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02008679 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008680 Py_DECREF(str_obj);
8681 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008682 }
Tim Petersced69f82003-09-16 20:30:58 +00008683
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008684 kind1 = PyUnicode_KIND(str_obj);
8685 kind2 = PyUnicode_KIND(sub_obj);
8686 kind = kind1 > kind2 ? kind1 : kind2;
8687 buf1 = PyUnicode_DATA(str_obj);
8688 if (kind1 != kind)
8689 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
8690 if (!buf1)
8691 goto onError;
8692 buf2 = PyUnicode_DATA(sub_obj);
8693 if (kind2 != kind)
8694 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
8695 if (!buf2)
8696 goto onError;
8697 len1 = PyUnicode_GET_LENGTH(str_obj);
8698 len2 = PyUnicode_GET_LENGTH(sub_obj);
8699
8700 ADJUST_INDICES(start, end, len1);
8701 switch(kind) {
8702 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008703 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
8704 result = asciilib_count(
8705 ((Py_UCS1*)buf1) + start, end - start,
8706 buf2, len2, PY_SSIZE_T_MAX
8707 );
8708 else
8709 result = ucs1lib_count(
8710 ((Py_UCS1*)buf1) + start, end - start,
8711 buf2, len2, PY_SSIZE_T_MAX
8712 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008713 break;
8714 case PyUnicode_2BYTE_KIND:
8715 result = ucs2lib_count(
8716 ((Py_UCS2*)buf1) + start, end - start,
8717 buf2, len2, PY_SSIZE_T_MAX
8718 );
8719 break;
8720 case PyUnicode_4BYTE_KIND:
8721 result = ucs4lib_count(
8722 ((Py_UCS4*)buf1) + start, end - start,
8723 buf2, len2, PY_SSIZE_T_MAX
8724 );
8725 break;
8726 default:
8727 assert(0); result = 0;
8728 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008729
8730 Py_DECREF(sub_obj);
8731 Py_DECREF(str_obj);
8732
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008733 if (kind1 != kind)
8734 PyMem_Free(buf1);
8735 if (kind2 != kind)
8736 PyMem_Free(buf2);
8737
Guido van Rossumd57fd912000-03-10 22:53:23 +00008738 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008739 onError:
8740 Py_DECREF(sub_obj);
8741 Py_DECREF(str_obj);
8742 if (kind1 != kind && buf1)
8743 PyMem_Free(buf1);
8744 if (kind2 != kind && buf2)
8745 PyMem_Free(buf2);
8746 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008747}
8748
Alexander Belopolsky40018472011-02-26 01:02:56 +00008749Py_ssize_t
8750PyUnicode_Find(PyObject *str,
8751 PyObject *sub,
8752 Py_ssize_t start,
8753 Py_ssize_t end,
8754 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008755{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008756 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008757
Guido van Rossumd57fd912000-03-10 22:53:23 +00008758 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008759 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008760 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008761 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008762 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008763 Py_DECREF(str);
8764 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008765 }
Tim Petersced69f82003-09-16 20:30:58 +00008766
Victor Stinner794d5672011-10-10 03:21:36 +02008767 result = any_find_slice(direction,
8768 str, sub, start, end
8769 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00008770
Guido van Rossumd57fd912000-03-10 22:53:23 +00008771 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008772 Py_DECREF(sub);
8773
Guido van Rossumd57fd912000-03-10 22:53:23 +00008774 return result;
8775}
8776
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008777Py_ssize_t
8778PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8779 Py_ssize_t start, Py_ssize_t end,
8780 int direction)
8781{
8782 char *result;
8783 int kind;
8784 if (PyUnicode_READY(str) == -1)
8785 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008786 if (start < 0 || end < 0) {
8787 PyErr_SetString(PyExc_IndexError, "string index out of range");
8788 return -2;
8789 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008790 if (end > PyUnicode_GET_LENGTH(str))
8791 end = PyUnicode_GET_LENGTH(str);
8792 kind = PyUnicode_KIND(str);
8793 result = findchar(PyUnicode_1BYTE_DATA(str)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02008794 + kind*start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008795 kind,
8796 end-start, ch, direction);
8797 if (!result)
8798 return -1;
8799 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8800}
8801
Alexander Belopolsky40018472011-02-26 01:02:56 +00008802static int
8803tailmatch(PyUnicodeObject *self,
8804 PyUnicodeObject *substring,
8805 Py_ssize_t start,
8806 Py_ssize_t end,
8807 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008808{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008809 int kind_self;
8810 int kind_sub;
8811 void *data_self;
8812 void *data_sub;
8813 Py_ssize_t offset;
8814 Py_ssize_t i;
8815 Py_ssize_t end_sub;
8816
8817 if (PyUnicode_READY(self) == -1 ||
8818 PyUnicode_READY(substring) == -1)
8819 return 0;
8820
8821 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008822 return 1;
8823
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008824 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8825 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008826 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008827 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008828
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008829 kind_self = PyUnicode_KIND(self);
8830 data_self = PyUnicode_DATA(self);
8831 kind_sub = PyUnicode_KIND(substring);
8832 data_sub = PyUnicode_DATA(substring);
8833 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8834
8835 if (direction > 0)
8836 offset = end;
8837 else
8838 offset = start;
8839
8840 if (PyUnicode_READ(kind_self, data_self, offset) ==
8841 PyUnicode_READ(kind_sub, data_sub, 0) &&
8842 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8843 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8844 /* If both are of the same kind, memcmp is sufficient */
8845 if (kind_self == kind_sub) {
8846 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02008847 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008848 data_sub,
8849 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02008850 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008851 }
8852 /* otherwise we have to compare each character by first accesing it */
8853 else {
8854 /* We do not need to compare 0 and len(substring)-1 because
8855 the if statement above ensured already that they are equal
8856 when we end up here. */
8857 // TODO: honor direction and do a forward or backwards search
8858 for (i = 1; i < end_sub; ++i) {
8859 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8860 PyUnicode_READ(kind_sub, data_sub, i))
8861 return 0;
8862 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008863 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008864 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008865 }
8866
8867 return 0;
8868}
8869
Alexander Belopolsky40018472011-02-26 01:02:56 +00008870Py_ssize_t
8871PyUnicode_Tailmatch(PyObject *str,
8872 PyObject *substr,
8873 Py_ssize_t start,
8874 Py_ssize_t end,
8875 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008876{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008877 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008878
Guido van Rossumd57fd912000-03-10 22:53:23 +00008879 str = PyUnicode_FromObject(str);
8880 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008881 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008882 substr = PyUnicode_FromObject(substr);
8883 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008884 Py_DECREF(str);
8885 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008886 }
Tim Petersced69f82003-09-16 20:30:58 +00008887
Guido van Rossumd57fd912000-03-10 22:53:23 +00008888 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008889 (PyUnicodeObject *)substr,
8890 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008891 Py_DECREF(str);
8892 Py_DECREF(substr);
8893 return result;
8894}
8895
Guido van Rossumd57fd912000-03-10 22:53:23 +00008896/* Apply fixfct filter to the Unicode object self and return a
8897 reference to the modified object */
8898
Alexander Belopolsky40018472011-02-26 01:02:56 +00008899static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02008900fixup(PyObject *self,
8901 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008902{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008903 PyObject *u;
8904 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008905
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008906 if (PyUnicode_READY(self) == -1)
8907 return NULL;
8908 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8909 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8910 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008911 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008912 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008913
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008914 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02008915 PyUnicode_GET_LENGTH(u) * PyUnicode_KIND(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008916
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008917 /* fix functions return the new maximum character in a string,
8918 if the kind of the resulting unicode object does not change,
8919 everything is fine. Otherwise we need to change the string kind
8920 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02008921 maxchar_new = fixfct(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008922 if (maxchar_new == 0)
8923 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8924 else if (maxchar_new <= 127)
8925 maxchar_new = 127;
8926 else if (maxchar_new <= 255)
8927 maxchar_new = 255;
8928 else if (maxchar_new <= 65535)
8929 maxchar_new = 65535;
8930 else
8931 maxchar_new = 1114111; /* 0x10ffff */
8932
8933 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008934 /* fixfct should return TRUE if it modified the buffer. If
8935 FALSE, return a reference to the original buffer instead
8936 (to save space, not time) */
8937 Py_INCREF(self);
8938 Py_DECREF(u);
8939 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008940 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008941 else if (maxchar_new == maxchar_old) {
8942 return u;
8943 }
8944 else {
8945 /* In case the maximum character changed, we need to
8946 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008947 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008948 if (v == NULL) {
8949 Py_DECREF(u);
8950 return NULL;
8951 }
8952 if (maxchar_new > maxchar_old) {
8953 /* If the maxchar increased so that the kind changed, not all
8954 characters are representable anymore and we need to fix the
8955 string again. This only happens in very few cases. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02008956 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner9310abb2011-10-05 00:59:23 +02008957 maxchar_old = fixfct(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008958 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8959 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008960 else {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02008961 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008962 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008963
8964 Py_DECREF(u);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02008965 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008966 return v;
8967 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008968}
8969
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008970static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008971fixupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008972{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008973 /* No need to call PyUnicode_READY(self) because this function is only
8974 called as a callback from fixup() which does it already. */
8975 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8976 const int kind = PyUnicode_KIND(self);
8977 void *data = PyUnicode_DATA(self);
8978 int touched = 0;
8979 Py_UCS4 maxchar = 0;
8980 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008981
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008982 for (i = 0; i < len; ++i) {
8983 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8984 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8985 if (up != ch) {
8986 if (up > maxchar)
8987 maxchar = up;
8988 PyUnicode_WRITE(kind, data, i, up);
8989 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008990 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008991 else if (ch > maxchar)
8992 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008993 }
8994
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008995 if (touched)
8996 return maxchar;
8997 else
8998 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008999}
9000
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009001static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009002fixlower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009003{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009004 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9005 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9006 const int kind = PyUnicode_KIND(self);
9007 void *data = PyUnicode_DATA(self);
9008 int touched = 0;
9009 Py_UCS4 maxchar = 0;
9010 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009011
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009012 for(i = 0; i < len; ++i) {
9013 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9014 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9015 if (lo != ch) {
9016 if (lo > maxchar)
9017 maxchar = lo;
9018 PyUnicode_WRITE(kind, data, i, lo);
9019 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009020 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009021 else if (ch > maxchar)
9022 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009023 }
9024
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009025 if (touched)
9026 return maxchar;
9027 else
9028 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009029}
9030
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009031static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009032fixswapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009033{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009034 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9035 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9036 const int kind = PyUnicode_KIND(self);
9037 void *data = PyUnicode_DATA(self);
9038 int touched = 0;
9039 Py_UCS4 maxchar = 0;
9040 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009041
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009042 for(i = 0; i < len; ++i) {
9043 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9044 Py_UCS4 nu = 0;
9045
9046 if (Py_UNICODE_ISUPPER(ch))
9047 nu = Py_UNICODE_TOLOWER(ch);
9048 else if (Py_UNICODE_ISLOWER(ch))
9049 nu = Py_UNICODE_TOUPPER(ch);
9050
9051 if (nu != 0) {
9052 if (nu > maxchar)
9053 maxchar = nu;
9054 PyUnicode_WRITE(kind, data, i, nu);
9055 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009056 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009057 else if (ch > maxchar)
9058 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009059 }
9060
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009061 if (touched)
9062 return maxchar;
9063 else
9064 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009065}
9066
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009067static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009068fixcapitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009069{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009070 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9071 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9072 const int kind = PyUnicode_KIND(self);
9073 void *data = PyUnicode_DATA(self);
9074 int touched = 0;
9075 Py_UCS4 maxchar = 0;
9076 Py_ssize_t i = 0;
9077 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00009078
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009079 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009080 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009081
9082 ch = PyUnicode_READ(kind, data, i);
9083 if (!Py_UNICODE_ISUPPER(ch)) {
9084 maxchar = Py_UNICODE_TOUPPER(ch);
9085 PyUnicode_WRITE(kind, data, i, maxchar);
9086 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009087 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009088 ++i;
9089 for(; i < len; ++i) {
9090 ch = PyUnicode_READ(kind, data, i);
9091 if (!Py_UNICODE_ISLOWER(ch)) {
9092 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9093 if (lo > maxchar)
9094 maxchar = lo;
9095 PyUnicode_WRITE(kind, data, i, lo);
9096 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009097 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009098 else if (ch > maxchar)
9099 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009100 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009101
9102 if (touched)
9103 return maxchar;
9104 else
9105 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009106}
9107
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009108static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009109fixtitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009110{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009111 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9112 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9113 const int kind = PyUnicode_KIND(self);
9114 void *data = PyUnicode_DATA(self);
9115 Py_UCS4 maxchar = 0;
9116 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009117 int previous_is_cased;
9118
9119 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009120 if (len == 1) {
9121 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9122 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
9123 if (ti != ch) {
9124 PyUnicode_WRITE(kind, data, i, ti);
9125 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00009126 }
9127 else
9128 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009129 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009130 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009131 for(; i < len; ++i) {
9132 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9133 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00009134
Benjamin Peterson29060642009-01-31 22:14:21 +00009135 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009136 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00009137 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009138 nu = Py_UNICODE_TOTITLE(ch);
9139
9140 if (nu > maxchar)
9141 maxchar = nu;
9142 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00009143
Benjamin Peterson29060642009-01-31 22:14:21 +00009144 if (Py_UNICODE_ISLOWER(ch) ||
9145 Py_UNICODE_ISUPPER(ch) ||
9146 Py_UNICODE_ISTITLE(ch))
9147 previous_is_cased = 1;
9148 else
9149 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009150 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009151 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009152}
9153
Tim Peters8ce9f162004-08-27 01:49:32 +00009154PyObject *
9155PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009156{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009157 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009158 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009159 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009160 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009161 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9162 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009163 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009164 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009165 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009166 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009167 int use_memcpy;
9168 unsigned char *res_data = NULL, *sep_data = NULL;
9169 PyObject *last_obj;
9170 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009171
Tim Peters05eba1f2004-08-27 21:32:02 +00009172 fseq = PySequence_Fast(seq, "");
9173 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009174 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009175 }
9176
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009177 /* NOTE: the following code can't call back into Python code,
9178 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009179 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009180
Tim Peters05eba1f2004-08-27 21:32:02 +00009181 seqlen = PySequence_Fast_GET_SIZE(fseq);
9182 /* If empty sequence, return u"". */
9183 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009184 Py_DECREF(fseq);
9185 Py_INCREF(unicode_empty);
9186 res = unicode_empty;
9187 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009188 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009189
Tim Peters05eba1f2004-08-27 21:32:02 +00009190 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009191 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009192 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009193 if (seqlen == 1) {
9194 if (PyUnicode_CheckExact(items[0])) {
9195 res = items[0];
9196 Py_INCREF(res);
9197 Py_DECREF(fseq);
9198 return res;
9199 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009200 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009201 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009202 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009203 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009204 /* Set up sep and seplen */
9205 if (separator == NULL) {
9206 /* fall back to a blank space separator */
9207 sep = PyUnicode_FromOrdinal(' ');
9208 if (!sep)
9209 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009210 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009211 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009212 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009213 else {
9214 if (!PyUnicode_Check(separator)) {
9215 PyErr_Format(PyExc_TypeError,
9216 "separator: expected str instance,"
9217 " %.80s found",
9218 Py_TYPE(separator)->tp_name);
9219 goto onError;
9220 }
9221 if (PyUnicode_READY(separator))
9222 goto onError;
9223 sep = separator;
9224 seplen = PyUnicode_GET_LENGTH(separator);
9225 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9226 /* inc refcount to keep this code path symmetric with the
9227 above case of a blank separator */
9228 Py_INCREF(sep);
9229 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009230 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009231 }
9232
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009233 /* There are at least two things to join, or else we have a subclass
9234 * of str in the sequence.
9235 * Do a pre-pass to figure out the total amount of space we'll
9236 * need (sz), and see whether all argument are strings.
9237 */
9238 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009239#ifdef Py_DEBUG
9240 use_memcpy = 0;
9241#else
9242 use_memcpy = 1;
9243#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009244 for (i = 0; i < seqlen; i++) {
9245 const Py_ssize_t old_sz = sz;
9246 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009247 if (!PyUnicode_Check(item)) {
9248 PyErr_Format(PyExc_TypeError,
9249 "sequence item %zd: expected str instance,"
9250 " %.80s found",
9251 i, Py_TYPE(item)->tp_name);
9252 goto onError;
9253 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009254 if (PyUnicode_READY(item) == -1)
9255 goto onError;
9256 sz += PyUnicode_GET_LENGTH(item);
9257 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009258 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009259 if (i != 0)
9260 sz += seplen;
9261 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9262 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009263 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009264 goto onError;
9265 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009266 if (use_memcpy && last_obj != NULL) {
9267 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9268 use_memcpy = 0;
9269 }
9270 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009271 }
Tim Petersced69f82003-09-16 20:30:58 +00009272
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009273 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009274 if (res == NULL)
9275 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009276
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009277 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009278#ifdef Py_DEBUG
9279 use_memcpy = 0;
9280#else
9281 if (use_memcpy) {
9282 res_data = PyUnicode_1BYTE_DATA(res);
9283 kind = PyUnicode_KIND(res);
9284 if (seplen != 0)
9285 sep_data = PyUnicode_1BYTE_DATA(sep);
9286 }
9287#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009288 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009289 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009290 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009291 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009292 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009293 if (use_memcpy) {
9294 Py_MEMCPY(res_data,
9295 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009296 kind * seplen);
9297 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009298 }
9299 else {
9300 copy_characters(res, res_offset, sep, 0, seplen);
9301 res_offset += seplen;
9302 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009303 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009304 itemlen = PyUnicode_GET_LENGTH(item);
9305 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009306 if (use_memcpy) {
9307 Py_MEMCPY(res_data,
9308 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009309 kind * itemlen);
9310 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009311 }
9312 else {
9313 copy_characters(res, res_offset, item, 0, itemlen);
9314 res_offset += itemlen;
9315 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009316 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009317 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009318 if (use_memcpy)
9319 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009320 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009321 else
9322 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009323
Tim Peters05eba1f2004-08-27 21:32:02 +00009324 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009325 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009326 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009327 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009328
Benjamin Peterson29060642009-01-31 22:14:21 +00009329 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009330 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009331 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009332 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009333 return NULL;
9334}
9335
/* FILL(kind, data, value, start, length): store value into length
   consecutive characters of the buffer data, beginning at character index
   start.  kind selects the character width: 1-byte buffers are filled with
   memset(), 2-byte buffers as Py_UCS2, and anything else as Py_UCS4.
   Every argument is parenthesized so that expressions can be passed
   safely. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        } \
    } while (0)
9358
Victor Stinner9310abb2011-10-05 00:59:23 +02009359static PyObject *
9360pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009361 Py_ssize_t left,
9362 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009363 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009364{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009365 PyObject *u;
9366 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009367 int kind;
9368 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009369
9370 if (left < 0)
9371 left = 0;
9372 if (right < 0)
9373 right = 0;
9374
Tim Peters7a29bd52001-09-12 03:03:31 +00009375 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009376 Py_INCREF(self);
9377 return self;
9378 }
9379
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009380 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9381 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009382 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9383 return NULL;
9384 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009385 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9386 if (fill > maxchar)
9387 maxchar = fill;
9388 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009389 if (!u)
9390 return NULL;
9391
9392 kind = PyUnicode_KIND(u);
9393 data = PyUnicode_DATA(u);
9394 if (left)
9395 FILL(kind, data, fill, 0, left);
9396 if (right)
9397 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009398 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009399 assert(_PyUnicode_CheckConsistency(u, 1));
9400 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009401}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009402#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009403
Alexander Belopolsky40018472011-02-26 01:02:56 +00009404PyObject *
9405PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009406{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009407 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009408
9409 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009410 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009411 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009412
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009413 switch(PyUnicode_KIND(string)) {
9414 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009415 if (PyUnicode_IS_ASCII(string))
9416 list = asciilib_splitlines(
9417 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9418 PyUnicode_GET_LENGTH(string), keepends);
9419 else
9420 list = ucs1lib_splitlines(
9421 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9422 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009423 break;
9424 case PyUnicode_2BYTE_KIND:
9425 list = ucs2lib_splitlines(
9426 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
9427 PyUnicode_GET_LENGTH(string), keepends);
9428 break;
9429 case PyUnicode_4BYTE_KIND:
9430 list = ucs4lib_splitlines(
9431 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
9432 PyUnicode_GET_LENGTH(string), keepends);
9433 break;
9434 default:
9435 assert(0);
9436 list = 0;
9437 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009438 Py_DECREF(string);
9439 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009440}
9441
Alexander Belopolsky40018472011-02-26 01:02:56 +00009442static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009443split(PyObject *self,
9444 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009445 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009446{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009447 int kind1, kind2, kind;
9448 void *buf1, *buf2;
9449 Py_ssize_t len1, len2;
9450 PyObject* out;
9451
Guido van Rossumd57fd912000-03-10 22:53:23 +00009452 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009453 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009454
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009455 if (PyUnicode_READY(self) == -1)
9456 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009457
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009458 if (substring == NULL)
9459 switch(PyUnicode_KIND(self)) {
9460 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009461 if (PyUnicode_IS_ASCII(self))
9462 return asciilib_split_whitespace(
9463 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9464 PyUnicode_GET_LENGTH(self), maxcount
9465 );
9466 else
9467 return ucs1lib_split_whitespace(
9468 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9469 PyUnicode_GET_LENGTH(self), maxcount
9470 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009471 case PyUnicode_2BYTE_KIND:
9472 return ucs2lib_split_whitespace(
9473 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9474 PyUnicode_GET_LENGTH(self), maxcount
9475 );
9476 case PyUnicode_4BYTE_KIND:
9477 return ucs4lib_split_whitespace(
9478 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9479 PyUnicode_GET_LENGTH(self), maxcount
9480 );
9481 default:
9482 assert(0);
9483 return NULL;
9484 }
9485
9486 if (PyUnicode_READY(substring) == -1)
9487 return NULL;
9488
9489 kind1 = PyUnicode_KIND(self);
9490 kind2 = PyUnicode_KIND(substring);
9491 kind = kind1 > kind2 ? kind1 : kind2;
9492 buf1 = PyUnicode_DATA(self);
9493 buf2 = PyUnicode_DATA(substring);
9494 if (kind1 != kind)
9495 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9496 if (!buf1)
9497 return NULL;
9498 if (kind2 != kind)
9499 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9500 if (!buf2) {
9501 if (kind1 != kind) PyMem_Free(buf1);
9502 return NULL;
9503 }
9504 len1 = PyUnicode_GET_LENGTH(self);
9505 len2 = PyUnicode_GET_LENGTH(substring);
9506
9507 switch(kind) {
9508 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009509 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9510 out = asciilib_split(
9511 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9512 else
9513 out = ucs1lib_split(
9514 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009515 break;
9516 case PyUnicode_2BYTE_KIND:
9517 out = ucs2lib_split(
9518 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9519 break;
9520 case PyUnicode_4BYTE_KIND:
9521 out = ucs4lib_split(
9522 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9523 break;
9524 default:
9525 out = NULL;
9526 }
9527 if (kind1 != kind)
9528 PyMem_Free(buf1);
9529 if (kind2 != kind)
9530 PyMem_Free(buf2);
9531 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009532}
9533
Alexander Belopolsky40018472011-02-26 01:02:56 +00009534static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009535rsplit(PyObject *self,
9536 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009537 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009538{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009539 int kind1, kind2, kind;
9540 void *buf1, *buf2;
9541 Py_ssize_t len1, len2;
9542 PyObject* out;
9543
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009544 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009545 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009546
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009547 if (PyUnicode_READY(self) == -1)
9548 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009549
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009550 if (substring == NULL)
9551 switch(PyUnicode_KIND(self)) {
9552 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009553 if (PyUnicode_IS_ASCII(self))
9554 return asciilib_rsplit_whitespace(
9555 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9556 PyUnicode_GET_LENGTH(self), maxcount
9557 );
9558 else
9559 return ucs1lib_rsplit_whitespace(
9560 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9561 PyUnicode_GET_LENGTH(self), maxcount
9562 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009563 case PyUnicode_2BYTE_KIND:
9564 return ucs2lib_rsplit_whitespace(
9565 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9566 PyUnicode_GET_LENGTH(self), maxcount
9567 );
9568 case PyUnicode_4BYTE_KIND:
9569 return ucs4lib_rsplit_whitespace(
9570 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9571 PyUnicode_GET_LENGTH(self), maxcount
9572 );
9573 default:
9574 assert(0);
9575 return NULL;
9576 }
9577
9578 if (PyUnicode_READY(substring) == -1)
9579 return NULL;
9580
9581 kind1 = PyUnicode_KIND(self);
9582 kind2 = PyUnicode_KIND(substring);
9583 kind = kind1 > kind2 ? kind1 : kind2;
9584 buf1 = PyUnicode_DATA(self);
9585 buf2 = PyUnicode_DATA(substring);
9586 if (kind1 != kind)
9587 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9588 if (!buf1)
9589 return NULL;
9590 if (kind2 != kind)
9591 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9592 if (!buf2) {
9593 if (kind1 != kind) PyMem_Free(buf1);
9594 return NULL;
9595 }
9596 len1 = PyUnicode_GET_LENGTH(self);
9597 len2 = PyUnicode_GET_LENGTH(substring);
9598
9599 switch(kind) {
9600 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009601 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9602 out = asciilib_rsplit(
9603 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9604 else
9605 out = ucs1lib_rsplit(
9606 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009607 break;
9608 case PyUnicode_2BYTE_KIND:
9609 out = ucs2lib_rsplit(
9610 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9611 break;
9612 case PyUnicode_4BYTE_KIND:
9613 out = ucs4lib_rsplit(
9614 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9615 break;
9616 default:
9617 out = NULL;
9618 }
9619 if (kind1 != kind)
9620 PyMem_Free(buf1);
9621 if (kind2 != kind)
9622 PyMem_Free(buf2);
9623 return out;
9624}
9625
9626static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009627anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9628 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009629{
9630 switch(kind) {
9631 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009632 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9633 return asciilib_find(buf1, len1, buf2, len2, offset);
9634 else
9635 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009636 case PyUnicode_2BYTE_KIND:
9637 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9638 case PyUnicode_4BYTE_KIND:
9639 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9640 }
9641 assert(0);
9642 return -1;
9643}
9644
9645static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009646anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
9647 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009648{
9649 switch(kind) {
9650 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009651 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
9652 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
9653 else
9654 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009655 case PyUnicode_2BYTE_KIND:
9656 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9657 case PyUnicode_4BYTE_KIND:
9658 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9659 }
9660 assert(0);
9661 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009662}
9663
Alexander Belopolsky40018472011-02-26 01:02:56 +00009664static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009665replace(PyObject *self, PyObject *str1,
9666 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009667{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009668 PyObject *u;
9669 char *sbuf = PyUnicode_DATA(self);
9670 char *buf1 = PyUnicode_DATA(str1);
9671 char *buf2 = PyUnicode_DATA(str2);
9672 int srelease = 0, release1 = 0, release2 = 0;
9673 int skind = PyUnicode_KIND(self);
9674 int kind1 = PyUnicode_KIND(str1);
9675 int kind2 = PyUnicode_KIND(str2);
9676 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9677 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9678 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +02009679 int mayshrink;
9680 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009681
9682 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009683 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009684 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009685 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009686
Victor Stinner59de0ee2011-10-07 10:01:28 +02009687 if (str1 == str2)
9688 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009689 if (skind < kind1)
9690 /* substring too wide to be present */
9691 goto nothing;
9692
Victor Stinner49a0a212011-10-12 23:46:10 +02009693 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9694 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
9695 /* Replacing str1 with str2 may cause a maxchar reduction in the
9696 result string. */
9697 mayshrink = (maxchar_str2 < maxchar);
9698 maxchar = Py_MAX(maxchar, maxchar_str2);
9699
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009700 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00009701 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009702 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009703 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009704 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009705 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009706 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +02009707 Py_UCS4 u1, u2;
9708 int rkind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009709 u1 = PyUnicode_READ_CHAR(str1, 0);
9710 if (!findchar(sbuf, PyUnicode_KIND(self),
9711 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00009712 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009713 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009714 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009715 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009716 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009717 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009718 rkind = PyUnicode_KIND(u);
9719 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
9720 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009721 if (--maxcount < 0)
9722 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009723 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009724 }
Victor Stinner49a0a212011-10-12 23:46:10 +02009725 }
9726 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009727 int rkind = skind;
9728 char *res;
Victor Stinner25a4b292011-10-06 12:31:55 +02009729
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009730 if (kind1 < rkind) {
9731 /* widen substring */
9732 buf1 = _PyUnicode_AsKind(str1, rkind);
9733 if (!buf1) goto error;
9734 release1 = 1;
9735 }
Victor Stinnerc3cec782011-10-05 21:24:08 +02009736 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009737 if (i < 0)
9738 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009739 if (rkind > kind2) {
9740 /* widen replacement */
9741 buf2 = _PyUnicode_AsKind(str2, rkind);
9742 if (!buf2) goto error;
9743 release2 = 1;
9744 }
9745 else if (rkind < kind2) {
9746 /* widen self and buf1 */
9747 rkind = kind2;
9748 if (release1) PyMem_Free(buf1);
9749 sbuf = _PyUnicode_AsKind(self, rkind);
9750 if (!sbuf) goto error;
9751 srelease = 1;
9752 buf1 = _PyUnicode_AsKind(str1, rkind);
9753 if (!buf1) goto error;
9754 release1 = 1;
9755 }
Victor Stinner49a0a212011-10-12 23:46:10 +02009756 u = PyUnicode_New(slen, maxchar);
9757 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009758 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +02009759 assert(PyUnicode_KIND(u) == rkind);
9760 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +02009761
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009762 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009763 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009764 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009765 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009766 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009767 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009768
9769 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +02009770 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009771 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009772 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009773 if (i == -1)
9774 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009775 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009776 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009777 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009778 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009779 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009780 }
Victor Stinner49a0a212011-10-12 23:46:10 +02009781 }
9782 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009783 Py_ssize_t n, i, j, ires;
9784 Py_ssize_t product, new_size;
9785 int rkind = skind;
9786 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009787
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009788 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +02009789 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009790 buf1 = _PyUnicode_AsKind(str1, rkind);
9791 if (!buf1) goto error;
9792 release1 = 1;
9793 }
Victor Stinnerc3cec782011-10-05 21:24:08 +02009794 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009795 if (n == 0)
9796 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009797 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +02009798 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009799 buf2 = _PyUnicode_AsKind(str2, rkind);
9800 if (!buf2) goto error;
9801 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009802 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009803 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +02009804 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009805 rkind = kind2;
9806 sbuf = _PyUnicode_AsKind(self, rkind);
9807 if (!sbuf) goto error;
9808 srelease = 1;
9809 if (release1) PyMem_Free(buf1);
9810 buf1 = _PyUnicode_AsKind(str1, rkind);
9811 if (!buf1) goto error;
9812 release1 = 1;
9813 }
9814 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
9815 PyUnicode_GET_LENGTH(str1))); */
9816 product = n * (len2-len1);
9817 if ((product / (len2-len1)) != n) {
9818 PyErr_SetString(PyExc_OverflowError,
9819 "replace string is too long");
9820 goto error;
9821 }
9822 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +02009823 if (new_size == 0) {
9824 Py_INCREF(unicode_empty);
9825 u = unicode_empty;
9826 goto done;
9827 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009828 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
9829 PyErr_SetString(PyExc_OverflowError,
9830 "replace string is too long");
9831 goto error;
9832 }
Victor Stinner49a0a212011-10-12 23:46:10 +02009833 u = PyUnicode_New(new_size, maxchar);
9834 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009835 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +02009836 assert(PyUnicode_KIND(u) == rkind);
9837 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009838 ires = i = 0;
9839 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009840 while (n-- > 0) {
9841 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +02009842 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009843 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009844 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009845 if (j == -1)
9846 break;
9847 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009848 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009849 memcpy(res + rkind * ires,
9850 sbuf + rkind * i,
9851 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009852 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009853 }
9854 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009855 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009856 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009857 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009858 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009859 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009860 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009861 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009862 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009863 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009864 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009865 memcpy(res + rkind * ires,
9866 sbuf + rkind * i,
9867 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +02009868 }
9869 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009870 /* interleave */
9871 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009872 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009873 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009874 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009875 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009876 if (--n <= 0)
9877 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009878 memcpy(res + rkind * ires,
9879 sbuf + rkind * i,
9880 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009881 ires++;
9882 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009883 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009884 memcpy(res + rkind * ires,
9885 sbuf + rkind * i,
9886 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009887 }
Victor Stinner49a0a212011-10-12 23:46:10 +02009888 }
9889
9890 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +02009891 unicode_adjust_maxchar(&u);
9892 if (u == NULL)
9893 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009894 }
Victor Stinner49a0a212011-10-12 23:46:10 +02009895
9896 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009897 if (srelease)
9898 PyMem_FREE(sbuf);
9899 if (release1)
9900 PyMem_FREE(buf1);
9901 if (release2)
9902 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009903 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009904 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009905
Benjamin Peterson29060642009-01-31 22:14:21 +00009906 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009907 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009908 if (srelease)
9909 PyMem_FREE(sbuf);
9910 if (release1)
9911 PyMem_FREE(buf1);
9912 if (release2)
9913 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009914 if (PyUnicode_CheckExact(self)) {
9915 Py_INCREF(self);
9916 return (PyObject *) self;
9917 }
Victor Stinner034f6cf2011-09-30 02:26:44 +02009918 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009919 error:
9920 if (srelease && sbuf)
9921 PyMem_FREE(sbuf);
9922 if (release1 && buf1)
9923 PyMem_FREE(buf1);
9924 if (release2 && buf2)
9925 PyMem_FREE(buf2);
9926 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009927}
9928
9929/* --- Unicode Object Methods --------------------------------------------- */
9930
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009931PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009932 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009933\n\
9934Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009935characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009936
9937static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +02009938unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009939{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009940 return fixup(self, fixtitle);
9941}
9942
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009943PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009944 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009945\n\
9946Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009947have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009948
9949static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +02009950unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009951{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009952 return fixup(self, fixcapitalize);
9953}
9954
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Compiled out by the surrounding #if 0.

   Splits self with split(self, NULL, -1), replaces every list item by the
   result of fixup(item, fixcapitalize), and joins the list with
   PyUnicode_Join(NULL, list).  Returns NULL if the split or any fixup
   call fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t pos;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, pos),
                            fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* Drop the old item before storing its replacement in the slot. */
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
9992
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009993/* Argument converter. Coerces to a single unicode character */
9994
9995static int
9996convert_uc(PyObject *obj, void *addr)
9997{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009998 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009999 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010000
Benjamin Peterson14339b62009-01-31 16:36:08 +000010001 uniobj = PyUnicode_FromObject(obj);
10002 if (uniobj == NULL) {
10003 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010004 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010005 return 0;
10006 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010007 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010008 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010009 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010010 Py_DECREF(uniobj);
10011 return 0;
10012 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010013 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010014 Py_DECREF(uniobj);
10015 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010016}
10017
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010018PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010019 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010020\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010021Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010022done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010023
10024static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010025unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010026{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010027 Py_ssize_t marg, left;
10028 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010029 Py_UCS4 fillchar = ' ';
10030
Victor Stinnere9a29352011-10-01 02:14:59 +020010031 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010032 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010033
Victor Stinnere9a29352011-10-01 02:14:59 +020010034 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010035 return NULL;
10036
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010037 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010038 Py_INCREF(self);
10039 return (PyObject*) self;
10040 }
10041
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010042 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010043 left = marg / 2 + (marg & width & 1);
10044
Victor Stinner9310abb2011-10-05 00:59:23 +020010045 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010046}
10047
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010048/* This function assumes that str1 and str2 are readied by the caller. */
10049
Marc-André Lemburge5034372000-08-08 08:04:29 +000010050static int
10051unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
10052{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010053 int kind1, kind2;
10054 void *data1, *data2;
10055 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010056
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010057 kind1 = PyUnicode_KIND(str1);
10058 kind2 = PyUnicode_KIND(str2);
10059 data1 = PyUnicode_DATA(str1);
10060 data2 = PyUnicode_DATA(str2);
10061 len1 = PyUnicode_GET_LENGTH(str1);
10062 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010063
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010064 for (i = 0; i < len1 && i < len2; ++i) {
10065 Py_UCS4 c1, c2;
10066 c1 = PyUnicode_READ(kind1, data1, i);
10067 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010068
10069 if (c1 != c2)
10070 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010071 }
10072
10073 return (len1 < len2) ? -1 : (len1 != len2);
10074}
10075
Alexander Belopolsky40018472011-02-26 01:02:56 +000010076int
10077PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010078{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010079 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10080 if (PyUnicode_READY(left) == -1 ||
10081 PyUnicode_READY(right) == -1)
10082 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010083 return unicode_compare((PyUnicodeObject *)left,
10084 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010085 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010086 PyErr_Format(PyExc_TypeError,
10087 "Can't compare %.100s and %.100s",
10088 left->ob_type->tp_name,
10089 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010090 return -1;
10091}
10092
Martin v. Löwis5b222132007-06-10 09:51:05 +000010093int
10094PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10095{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010096 Py_ssize_t i;
10097 int kind;
10098 void *data;
10099 Py_UCS4 chr;
10100
Victor Stinner910337b2011-10-03 03:20:16 +020010101 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010102 if (PyUnicode_READY(uni) == -1)
10103 return -1;
10104 kind = PyUnicode_KIND(uni);
10105 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010106 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010107 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10108 if (chr != str[i])
10109 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010110 /* This check keeps Python strings that end in '\0' from comparing equal
10111 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010112 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010113 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010114 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010115 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010116 return 0;
10117}
10118
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010119
Benjamin Peterson29060642009-01-31 22:14:21 +000010120#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010121 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010122
Alexander Belopolsky40018472011-02-26 01:02:56 +000010123PyObject *
10124PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010125{
10126 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010127
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010128 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10129 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010130 if (PyUnicode_READY(left) == -1 ||
10131 PyUnicode_READY(right) == -1)
10132 return NULL;
10133 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10134 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010135 if (op == Py_EQ) {
10136 Py_INCREF(Py_False);
10137 return Py_False;
10138 }
10139 if (op == Py_NE) {
10140 Py_INCREF(Py_True);
10141 return Py_True;
10142 }
10143 }
10144 if (left == right)
10145 result = 0;
10146 else
10147 result = unicode_compare((PyUnicodeObject *)left,
10148 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010149
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010150 /* Convert the return value to a Boolean */
10151 switch (op) {
10152 case Py_EQ:
10153 v = TEST_COND(result == 0);
10154 break;
10155 case Py_NE:
10156 v = TEST_COND(result != 0);
10157 break;
10158 case Py_LE:
10159 v = TEST_COND(result <= 0);
10160 break;
10161 case Py_GE:
10162 v = TEST_COND(result >= 0);
10163 break;
10164 case Py_LT:
10165 v = TEST_COND(result == -1);
10166 break;
10167 case Py_GT:
10168 v = TEST_COND(result == 1);
10169 break;
10170 default:
10171 PyErr_BadArgument();
10172 return NULL;
10173 }
10174 Py_INCREF(v);
10175 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010176 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010177
Brian Curtindfc80e32011-08-10 20:28:54 -050010178 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010179}
10180
Alexander Belopolsky40018472011-02-26 01:02:56 +000010181int
10182PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010183{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010184 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010185 int kind1, kind2, kind;
10186 void *buf1, *buf2;
10187 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010188 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010189
10190 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010191 sub = PyUnicode_FromObject(element);
10192 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010193 PyErr_Format(PyExc_TypeError,
10194 "'in <string>' requires string as left operand, not %s",
10195 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010196 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010197 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010198 if (PyUnicode_READY(sub) == -1)
10199 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010200
Thomas Wouters477c8d52006-05-27 19:21:47 +000010201 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +020010202 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010203 Py_DECREF(sub);
10204 return -1;
10205 }
10206
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010207 kind1 = PyUnicode_KIND(str);
10208 kind2 = PyUnicode_KIND(sub);
10209 kind = kind1 > kind2 ? kind1 : kind2;
10210 buf1 = PyUnicode_DATA(str);
10211 buf2 = PyUnicode_DATA(sub);
10212 if (kind1 != kind)
10213 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
10214 if (!buf1) {
10215 Py_DECREF(sub);
10216 return -1;
10217 }
10218 if (kind2 != kind)
10219 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
10220 if (!buf2) {
10221 Py_DECREF(sub);
10222 if (kind1 != kind) PyMem_Free(buf1);
10223 return -1;
10224 }
10225 len1 = PyUnicode_GET_LENGTH(str);
10226 len2 = PyUnicode_GET_LENGTH(sub);
10227
10228 switch(kind) {
10229 case PyUnicode_1BYTE_KIND:
10230 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10231 break;
10232 case PyUnicode_2BYTE_KIND:
10233 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10234 break;
10235 case PyUnicode_4BYTE_KIND:
10236 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10237 break;
10238 default:
10239 result = -1;
10240 assert(0);
10241 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010242
10243 Py_DECREF(str);
10244 Py_DECREF(sub);
10245
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010246 if (kind1 != kind)
10247 PyMem_Free(buf1);
10248 if (kind2 != kind)
10249 PyMem_Free(buf2);
10250
Guido van Rossum403d68b2000-03-13 15:55:09 +000010251 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010252}
10253
Guido van Rossumd57fd912000-03-10 22:53:23 +000010254/* Concat to string or Unicode object giving a new Unicode object. */
10255
Alexander Belopolsky40018472011-02-26 01:02:56 +000010256PyObject *
10257PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010258{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010259 PyObject *u = NULL, *v = NULL, *w;
10260 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010261
10262 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010263 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010264 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010265 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010266 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010267 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010268 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010269
10270 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010271 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010272 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010273 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010274 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010275 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010276 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010277 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010278 }
10279
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010280 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinnerff9e50f2011-09-28 22:17:19 +020010281 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010282
Guido van Rossumd57fd912000-03-10 22:53:23 +000010283 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010284 w = PyUnicode_New(
10285 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
10286 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010287 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010288 goto onError;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010289 copy_characters(w, 0, u, 0, PyUnicode_GET_LENGTH(u));
10290 copy_characters(w, PyUnicode_GET_LENGTH(u), v, 0, PyUnicode_GET_LENGTH(v));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010291 Py_DECREF(u);
10292 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010293 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010294 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010295
Benjamin Peterson29060642009-01-31 22:14:21 +000010296 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010297 Py_XDECREF(u);
10298 Py_XDECREF(v);
10299 return NULL;
10300}
10301
Victor Stinnerb0923652011-10-04 01:17:31 +020010302static void
10303unicode_append_inplace(PyObject **p_left, PyObject *right)
10304{
10305 Py_ssize_t left_len, right_len, new_len;
Victor Stinnerb0923652011-10-04 01:17:31 +020010306
10307 assert(PyUnicode_IS_READY(*p_left));
10308 assert(PyUnicode_IS_READY(right));
10309
10310 left_len = PyUnicode_GET_LENGTH(*p_left);
10311 right_len = PyUnicode_GET_LENGTH(right);
10312 if (left_len > PY_SSIZE_T_MAX - right_len) {
10313 PyErr_SetString(PyExc_OverflowError,
10314 "strings are too large to concat");
10315 goto error;
10316 }
10317 new_len = left_len + right_len;
10318
10319 /* Now we own the last reference to 'left', so we can resize it
10320 * in-place.
10321 */
10322 if (unicode_resize(p_left, new_len) != 0) {
10323 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10324 * deallocated so it cannot be put back into
10325 * 'variable'. The MemoryError is raised when there
10326 * is no value in 'variable', which might (very
10327 * remotely) be a cause of incompatibilities.
10328 */
10329 goto error;
10330 }
10331 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010332 copy_characters(*p_left, left_len, right, 0, right_len);
10333 _PyUnicode_DIRTY(*p_left);
Victor Stinnerb0923652011-10-04 01:17:31 +020010334 return;
10335
10336error:
10337 Py_DECREF(*p_left);
10338 *p_left = NULL;
10339}
10340
Walter Dörwald1ab83302007-05-18 17:15:44 +000010341void
Victor Stinner23e56682011-10-03 03:54:37 +020010342PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010343{
Victor Stinner23e56682011-10-03 03:54:37 +020010344 PyObject *left, *res;
10345
10346 if (p_left == NULL) {
10347 if (!PyErr_Occurred())
10348 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010349 return;
10350 }
Victor Stinner23e56682011-10-03 03:54:37 +020010351 left = *p_left;
10352 if (right == NULL || !PyUnicode_Check(left)) {
10353 if (!PyErr_Occurred())
10354 PyErr_BadInternalCall();
10355 goto error;
10356 }
10357
Victor Stinnere1335c72011-10-04 20:53:03 +020010358 if (PyUnicode_READY(left))
10359 goto error;
10360 if (PyUnicode_READY(right))
10361 goto error;
10362
Victor Stinner23e56682011-10-03 03:54:37 +020010363 if (PyUnicode_CheckExact(left) && left != unicode_empty
10364 && PyUnicode_CheckExact(right) && right != unicode_empty
10365 && unicode_resizable(left)
10366 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
10367 || _PyUnicode_WSTR(left) != NULL))
10368 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010369 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10370 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010371 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010372 not so different than duplicating the string. */
10373 if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
Victor Stinner23e56682011-10-03 03:54:37 +020010374 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010375 unicode_append_inplace(p_left, right);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010376 if (p_left != NULL)
10377 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010378 return;
10379 }
10380 }
10381
10382 res = PyUnicode_Concat(left, right);
10383 if (res == NULL)
10384 goto error;
10385 Py_DECREF(left);
10386 *p_left = res;
10387 return;
10388
10389error:
10390 Py_DECREF(*p_left);
10391 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +000010392}
10393
10394void
10395PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10396{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010397 PyUnicode_Append(pleft, right);
10398 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010399}
10400
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010401PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010402 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010403\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010404Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010405string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010406interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010407
10408static PyObject *
10409unicode_count(PyUnicodeObject *self, PyObject *args)
10410{
10411 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010412 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010413 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010414 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010415 int kind1, kind2, kind;
10416 void *buf1, *buf2;
10417 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010418
Jesus Ceaac451502011-04-20 17:09:23 +020010419 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10420 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010421 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010422
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010423 kind1 = PyUnicode_KIND(self);
10424 kind2 = PyUnicode_KIND(substring);
10425 kind = kind1 > kind2 ? kind1 : kind2;
10426 buf1 = PyUnicode_DATA(self);
10427 buf2 = PyUnicode_DATA(substring);
10428 if (kind1 != kind)
10429 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
10430 if (!buf1) {
10431 Py_DECREF(substring);
10432 return NULL;
10433 }
10434 if (kind2 != kind)
10435 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
10436 if (!buf2) {
10437 Py_DECREF(substring);
10438 if (kind1 != kind) PyMem_Free(buf1);
10439 return NULL;
10440 }
10441 len1 = PyUnicode_GET_LENGTH(self);
10442 len2 = PyUnicode_GET_LENGTH(substring);
10443
10444 ADJUST_INDICES(start, end, len1);
10445 switch(kind) {
10446 case PyUnicode_1BYTE_KIND:
10447 iresult = ucs1lib_count(
10448 ((Py_UCS1*)buf1) + start, end - start,
10449 buf2, len2, PY_SSIZE_T_MAX
10450 );
10451 break;
10452 case PyUnicode_2BYTE_KIND:
10453 iresult = ucs2lib_count(
10454 ((Py_UCS2*)buf1) + start, end - start,
10455 buf2, len2, PY_SSIZE_T_MAX
10456 );
10457 break;
10458 case PyUnicode_4BYTE_KIND:
10459 iresult = ucs4lib_count(
10460 ((Py_UCS4*)buf1) + start, end - start,
10461 buf2, len2, PY_SSIZE_T_MAX
10462 );
10463 break;
10464 default:
10465 assert(0); iresult = 0;
10466 }
10467
10468 result = PyLong_FromSsize_t(iresult);
10469
10470 if (kind1 != kind)
10471 PyMem_Free(buf1);
10472 if (kind2 != kind)
10473 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010474
10475 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010476
Guido van Rossumd57fd912000-03-10 22:53:23 +000010477 return result;
10478}
10479
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010480PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010481 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010482\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010483Encode S using the codec registered for encoding. Default encoding\n\
10484is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010485handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010486a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10487'xmlcharrefreplace' as well as any other name registered with\n\
10488codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010489
10490static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +000010491unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010492{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010493 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010494 char *encoding = NULL;
10495 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010496
Benjamin Peterson308d6372009-09-18 21:42:35 +000010497 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10498 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010499 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +000010500 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010501}
10502
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010503PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010504 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010505\n\
10506Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010507If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010508
10509static PyObject*
10510unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
10511{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010512 Py_ssize_t i, j, line_pos, src_len, incr;
10513 Py_UCS4 ch;
10514 PyObject *u;
10515 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010516 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010517 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010518 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010519
10520 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010521 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010522
Antoine Pitrou22425222011-10-04 19:10:51 +020010523 if (PyUnicode_READY(self) == -1)
10524 return NULL;
10525
Thomas Wouters7e474022000-07-16 12:04:32 +000010526 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010527 src_len = PyUnicode_GET_LENGTH(self);
10528 i = j = line_pos = 0;
10529 kind = PyUnicode_KIND(self);
10530 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010531 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010532 for (; i < src_len; i++) {
10533 ch = PyUnicode_READ(kind, src_data, i);
10534 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010535 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010536 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010537 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010538 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010539 goto overflow;
10540 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010541 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010542 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010543 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010544 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010545 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010546 goto overflow;
10547 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010548 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010549 if (ch == '\n' || ch == '\r')
10550 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010551 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010552 }
Antoine Pitroue19aa382011-10-04 16:04:01 +020010553 if (!found && PyUnicode_CheckExact(self)) {
10554 Py_INCREF((PyObject *) self);
10555 return (PyObject *) self;
10556 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010557
Guido van Rossumd57fd912000-03-10 22:53:23 +000010558 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010559 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010560 if (!u)
10561 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010562 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010563
Antoine Pitroue71d5742011-10-04 15:55:09 +020010564 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010565
Antoine Pitroue71d5742011-10-04 15:55:09 +020010566 for (; i < src_len; i++) {
10567 ch = PyUnicode_READ(kind, src_data, i);
10568 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010569 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010570 incr = tabsize - (line_pos % tabsize);
10571 line_pos += incr;
10572 while (incr--) {
10573 PyUnicode_WRITE(kind, dest_data, j, ' ');
10574 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010575 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010576 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010577 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010578 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010579 line_pos++;
10580 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010581 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010582 if (ch == '\n' || ch == '\r')
10583 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010584 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010585 }
10586 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinner17efeed2011-10-04 20:05:46 +020010587#ifndef DONT_MAKE_RESULT_READY
10588 if (_PyUnicode_READY_REPLACE(&u)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010589 Py_DECREF(u);
10590 return NULL;
10591 }
Victor Stinner17efeed2011-10-04 20:05:46 +020010592#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010593 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010594 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010595
Antoine Pitroue71d5742011-10-04 15:55:09 +020010596 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010597 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10598 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010599}
10600
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010601PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010602 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010603\n\
10604Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010605such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010606arguments start and end are interpreted as in slice notation.\n\
10607\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010608Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010609
10610static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010611unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010612{
Jesus Ceaac451502011-04-20 17:09:23 +020010613 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010614 Py_ssize_t start;
10615 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010616 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010617
Jesus Ceaac451502011-04-20 17:09:23 +020010618 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10619 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010620 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010621
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010622 if (PyUnicode_READY(self) == -1)
10623 return NULL;
10624 if (PyUnicode_READY(substring) == -1)
10625 return NULL;
10626
Victor Stinner794d5672011-10-10 03:21:36 +020010627 result = any_find_slice(1,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010628 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010629 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010630
10631 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010632
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010633 if (result == -2)
10634 return NULL;
10635
Christian Heimes217cfd12007-12-02 14:31:20 +000010636 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010637}
10638
10639static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010640unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010641{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010642 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10643 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010644 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010645 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010646}
10647
Guido van Rossumc2504932007-09-18 19:42:40 +000010648/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010649 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010650static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +000010651unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010652{
Guido van Rossumc2504932007-09-18 19:42:40 +000010653 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010654 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010655
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010656 if (_PyUnicode_HASH(self) != -1)
10657 return _PyUnicode_HASH(self);
10658 if (PyUnicode_READY(self) == -1)
10659 return -1;
10660 len = PyUnicode_GET_LENGTH(self);
10661
10662 /* The hash function as a macro, gets expanded three times below. */
10663#define HASH(P) \
10664 x = (Py_uhash_t)*P << 7; \
10665 while (--len >= 0) \
10666 x = (1000003*x) ^ (Py_uhash_t)*P++;
10667
10668 switch (PyUnicode_KIND(self)) {
10669 case PyUnicode_1BYTE_KIND: {
10670 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10671 HASH(c);
10672 break;
10673 }
10674 case PyUnicode_2BYTE_KIND: {
10675 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10676 HASH(s);
10677 break;
10678 }
10679 default: {
10680 Py_UCS4 *l;
10681 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10682 "Impossible switch case in unicode_hash");
10683 l = PyUnicode_4BYTE_DATA(self);
10684 HASH(l);
10685 break;
10686 }
10687 }
10688 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
10689
Guido van Rossumc2504932007-09-18 19:42:40 +000010690 if (x == -1)
10691 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010692 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010693 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010694}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010695#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000010696
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010697PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010698 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010699\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010700Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010701
10702static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010703unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010704{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010705 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +020010706 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010707 Py_ssize_t start;
10708 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010709
Jesus Ceaac451502011-04-20 17:09:23 +020010710 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10711 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010712 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010713
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010714 if (PyUnicode_READY(self) == -1)
10715 return NULL;
10716 if (PyUnicode_READY(substring) == -1)
10717 return NULL;
10718
Victor Stinner794d5672011-10-10 03:21:36 +020010719 result = any_find_slice(1,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010720 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010721 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010722
10723 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010724
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010725 if (result == -2)
10726 return NULL;
10727
Guido van Rossumd57fd912000-03-10 22:53:23 +000010728 if (result < 0) {
10729 PyErr_SetString(PyExc_ValueError, "substring not found");
10730 return NULL;
10731 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010732
Christian Heimes217cfd12007-12-02 14:31:20 +000010733 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010734}
10735
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010736PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010737 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010738\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010739Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010740at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010741
10742static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010743unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010744{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010745 Py_ssize_t i, length;
10746 int kind;
10747 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010748 int cased;
10749
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010750 if (PyUnicode_READY(self) == -1)
10751 return NULL;
10752 length = PyUnicode_GET_LENGTH(self);
10753 kind = PyUnicode_KIND(self);
10754 data = PyUnicode_DATA(self);
10755
Guido van Rossumd57fd912000-03-10 22:53:23 +000010756 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010757 if (length == 1)
10758 return PyBool_FromLong(
10759 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010760
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010761 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010762 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010763 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010764
Guido van Rossumd57fd912000-03-10 22:53:23 +000010765 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010766 for (i = 0; i < length; i++) {
10767 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010768
Benjamin Peterson29060642009-01-31 22:14:21 +000010769 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
10770 return PyBool_FromLong(0);
10771 else if (!cased && Py_UNICODE_ISLOWER(ch))
10772 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010773 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010774 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010775}
10776
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010777PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010778 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010779\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010780Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010781at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010782
10783static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010784unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010785{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010786 Py_ssize_t i, length;
10787 int kind;
10788 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010789 int cased;
10790
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010791 if (PyUnicode_READY(self) == -1)
10792 return NULL;
10793 length = PyUnicode_GET_LENGTH(self);
10794 kind = PyUnicode_KIND(self);
10795 data = PyUnicode_DATA(self);
10796
Guido van Rossumd57fd912000-03-10 22:53:23 +000010797 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010798 if (length == 1)
10799 return PyBool_FromLong(
10800 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010801
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010802 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010803 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010804 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010805
Guido van Rossumd57fd912000-03-10 22:53:23 +000010806 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010807 for (i = 0; i < length; i++) {
10808 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010809
Benjamin Peterson29060642009-01-31 22:14:21 +000010810 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
10811 return PyBool_FromLong(0);
10812 else if (!cased && Py_UNICODE_ISUPPER(ch))
10813 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010814 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010815 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010816}
10817
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010818PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010819 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010820\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010821Return True if S is a titlecased string and there is at least one\n\
10822character in S, i.e. upper- and titlecase characters may only\n\
10823follow uncased characters and lowercase characters only cased ones.\n\
10824Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010825
10826static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010827unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010828{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010829 Py_ssize_t i, length;
10830 int kind;
10831 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010832 int cased, previous_is_cased;
10833
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010834 if (PyUnicode_READY(self) == -1)
10835 return NULL;
10836 length = PyUnicode_GET_LENGTH(self);
10837 kind = PyUnicode_KIND(self);
10838 data = PyUnicode_DATA(self);
10839
Guido van Rossumd57fd912000-03-10 22:53:23 +000010840 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010841 if (length == 1) {
10842 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10843 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
10844 (Py_UNICODE_ISUPPER(ch) != 0));
10845 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010846
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010847 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010848 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010849 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010850
Guido van Rossumd57fd912000-03-10 22:53:23 +000010851 cased = 0;
10852 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010853 for (i = 0; i < length; i++) {
10854 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010855
Benjamin Peterson29060642009-01-31 22:14:21 +000010856 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10857 if (previous_is_cased)
10858 return PyBool_FromLong(0);
10859 previous_is_cased = 1;
10860 cased = 1;
10861 }
10862 else if (Py_UNICODE_ISLOWER(ch)) {
10863 if (!previous_is_cased)
10864 return PyBool_FromLong(0);
10865 previous_is_cased = 1;
10866 cased = 1;
10867 }
10868 else
10869 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010870 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010871 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010872}
10873
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010874PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010875 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010876\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010877Return True if all characters in S are whitespace\n\
10878and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010879
10880static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010881unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010882{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010883 Py_ssize_t i, length;
10884 int kind;
10885 void *data;
10886
10887 if (PyUnicode_READY(self) == -1)
10888 return NULL;
10889 length = PyUnicode_GET_LENGTH(self);
10890 kind = PyUnicode_KIND(self);
10891 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010892
Guido van Rossumd57fd912000-03-10 22:53:23 +000010893 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010894 if (length == 1)
10895 return PyBool_FromLong(
10896 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010897
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010898 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010899 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010900 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010901
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010902 for (i = 0; i < length; i++) {
10903 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010904 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010905 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010906 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010907 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010908}
10909
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010910PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010911 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010912\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010913Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010914and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010915
10916static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010917unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010918{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010919 Py_ssize_t i, length;
10920 int kind;
10921 void *data;
10922
10923 if (PyUnicode_READY(self) == -1)
10924 return NULL;
10925 length = PyUnicode_GET_LENGTH(self);
10926 kind = PyUnicode_KIND(self);
10927 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010928
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010929 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010930 if (length == 1)
10931 return PyBool_FromLong(
10932 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010933
10934 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010935 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010936 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010937
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010938 for (i = 0; i < length; i++) {
10939 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010940 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010941 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010942 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010943}
10944
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010945PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010946 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010947\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010948Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010949and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010950
10951static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010952unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010953{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010954 int kind;
10955 void *data;
10956 Py_ssize_t len, i;
10957
10958 if (PyUnicode_READY(self) == -1)
10959 return NULL;
10960
10961 kind = PyUnicode_KIND(self);
10962 data = PyUnicode_DATA(self);
10963 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010964
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010965 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010966 if (len == 1) {
10967 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10968 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10969 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010970
10971 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010972 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010973 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010974
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010975 for (i = 0; i < len; i++) {
10976 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010977 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010978 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010979 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010980 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010981}
10982
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010983PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010984 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010985\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010986Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010987False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010988
10989static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010990unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010991{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010992 Py_ssize_t i, length;
10993 int kind;
10994 void *data;
10995
10996 if (PyUnicode_READY(self) == -1)
10997 return NULL;
10998 length = PyUnicode_GET_LENGTH(self);
10999 kind = PyUnicode_KIND(self);
11000 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011001
Guido van Rossumd57fd912000-03-10 22:53:23 +000011002 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011003 if (length == 1)
11004 return PyBool_FromLong(
11005 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011006
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011007 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011008 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011009 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011010
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011011 for (i = 0; i < length; i++) {
11012 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011013 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011014 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011015 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011016}
11017
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011018PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011019 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011020\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011021Return True if all characters in S are digits\n\
11022and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011023
11024static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011025unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011026{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011027 Py_ssize_t i, length;
11028 int kind;
11029 void *data;
11030
11031 if (PyUnicode_READY(self) == -1)
11032 return NULL;
11033 length = PyUnicode_GET_LENGTH(self);
11034 kind = PyUnicode_KIND(self);
11035 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011036
Guido van Rossumd57fd912000-03-10 22:53:23 +000011037 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011038 if (length == 1) {
11039 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11040 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11041 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011042
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011043 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011044 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011045 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011046
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011047 for (i = 0; i < length; i++) {
11048 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011049 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011050 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011051 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011052}
11053
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011054PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011055 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011056\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011057Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011058False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011059
11060static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011061unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011062{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011063 Py_ssize_t i, length;
11064 int kind;
11065 void *data;
11066
11067 if (PyUnicode_READY(self) == -1)
11068 return NULL;
11069 length = PyUnicode_GET_LENGTH(self);
11070 kind = PyUnicode_KIND(self);
11071 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011072
Guido van Rossumd57fd912000-03-10 22:53:23 +000011073 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011074 if (length == 1)
11075 return PyBool_FromLong(
11076 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011077
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011078 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011079 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011080 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011081
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011082 for (i = 0; i < length; i++) {
11083 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011084 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011085 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011086 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011087}
11088
Martin v. Löwis47383402007-08-15 07:32:56 +000011089int
11090PyUnicode_IsIdentifier(PyObject *self)
11091{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011092 int kind;
11093 void *data;
11094 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011095 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011096
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011097 if (PyUnicode_READY(self) == -1) {
11098 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011099 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011100 }
11101
11102 /* Special case for empty strings */
11103 if (PyUnicode_GET_LENGTH(self) == 0)
11104 return 0;
11105 kind = PyUnicode_KIND(self);
11106 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011107
11108 /* PEP 3131 says that the first character must be in
11109 XID_Start and subsequent characters in XID_Continue,
11110 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011111 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011112 letters, digits, underscore). However, given the current
11113 definition of XID_Start and XID_Continue, it is sufficient
11114 to check just for these, except that _ must be allowed
11115 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011116 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011117 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011118 return 0;
11119
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011120 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011121 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011122 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011123 return 1;
11124}
11125
11126PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011127 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011128\n\
11129Return True if S is a valid identifier according\n\
11130to the language definition.");
11131
11132static PyObject*
11133unicode_isidentifier(PyObject *self)
11134{
11135 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11136}
11137
Georg Brandl559e5d72008-06-11 18:37:52 +000011138PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011139 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011140\n\
11141Return True if all characters in S are considered\n\
11142printable in repr() or S is empty, False otherwise.");
11143
11144static PyObject*
11145unicode_isprintable(PyObject *self)
11146{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011147 Py_ssize_t i, length;
11148 int kind;
11149 void *data;
11150
11151 if (PyUnicode_READY(self) == -1)
11152 return NULL;
11153 length = PyUnicode_GET_LENGTH(self);
11154 kind = PyUnicode_KIND(self);
11155 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011156
11157 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011158 if (length == 1)
11159 return PyBool_FromLong(
11160 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011161
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011162 for (i = 0; i < length; i++) {
11163 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011164 Py_RETURN_FALSE;
11165 }
11166 }
11167 Py_RETURN_TRUE;
11168}
11169
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011170PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011171 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011172\n\
11173Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011174iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011175
11176static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011177unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011178{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011179 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011180}
11181
Martin v. Löwis18e16552006-02-15 17:27:45 +000011182static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000011183unicode_length(PyUnicodeObject *self)
11184{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011185 if (PyUnicode_READY(self) == -1)
11186 return -1;
11187 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011188}
11189
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011190PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011191 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011192\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011193Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011194done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011195
11196static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011197unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011198{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011199 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011200 Py_UCS4 fillchar = ' ';
11201
11202 if (PyUnicode_READY(self) == -1)
11203 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011204
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011205 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011206 return NULL;
11207
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011208 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011209 Py_INCREF(self);
11210 return (PyObject*) self;
11211 }
11212
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011213 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011214}
11215
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011216PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011217 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011218\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011219Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011220
11221static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011222unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011223{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011224 return fixup(self, fixlower);
11225}
11226
/* Selectors for the end(s) of the string a strip operation trims. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by the selectors above.
   Both the strings and the table itself are read-only. */
static const char * const stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for selector i: the format string minus its 3-character
   "|O:" prefix. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
11235
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011236/* externally visible for str.strip(unicode) */
11237PyObject *
11238_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
11239{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011240 void *data;
11241 int kind;
11242 Py_ssize_t i, j, len;
11243 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011244
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011245 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11246 return NULL;
11247
11248 kind = PyUnicode_KIND(self);
11249 data = PyUnicode_DATA(self);
11250 len = PyUnicode_GET_LENGTH(self);
11251 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11252 PyUnicode_DATA(sepobj),
11253 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011254
Benjamin Peterson14339b62009-01-31 16:36:08 +000011255 i = 0;
11256 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011257 while (i < len &&
11258 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011259 i++;
11260 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011261 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011262
Benjamin Peterson14339b62009-01-31 16:36:08 +000011263 j = len;
11264 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011265 do {
11266 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011267 } while (j >= i &&
11268 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011269 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011270 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011271
Victor Stinner12bab6d2011-10-01 01:53:49 +020011272 return PyUnicode_Substring((PyObject*)self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011273}
11274
11275PyObject*
11276PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11277{
11278 unsigned char *data;
11279 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011280 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011281
Victor Stinnerde636f32011-10-01 03:55:54 +020011282 if (PyUnicode_READY(self) == -1)
11283 return NULL;
11284
11285 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11286
Victor Stinner12bab6d2011-10-01 01:53:49 +020011287 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011288 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011289 if (PyUnicode_CheckExact(self)) {
11290 Py_INCREF(self);
11291 return self;
11292 }
11293 else
11294 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011295 }
11296
Victor Stinner12bab6d2011-10-01 01:53:49 +020011297 length = end - start;
11298 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011299 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011300
Victor Stinnerde636f32011-10-01 03:55:54 +020011301 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011302 PyErr_SetString(PyExc_IndexError, "string index out of range");
11303 return NULL;
11304 }
11305
Victor Stinnerb9275c12011-10-05 14:01:42 +020011306 if (PyUnicode_IS_ASCII(self)) {
11307 kind = PyUnicode_KIND(self);
11308 data = PyUnicode_1BYTE_DATA(self);
11309 return unicode_fromascii(data + start, length);
11310 }
11311 else {
11312 kind = PyUnicode_KIND(self);
11313 data = PyUnicode_1BYTE_DATA(self);
11314 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011315 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011316 length);
11317 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011318}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011319
11320static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011321do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011322{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011323 int kind;
11324 void *data;
11325 Py_ssize_t len, i, j;
11326
11327 if (PyUnicode_READY(self) == -1)
11328 return NULL;
11329
11330 kind = PyUnicode_KIND(self);
11331 data = PyUnicode_DATA(self);
11332 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011333
Benjamin Peterson14339b62009-01-31 16:36:08 +000011334 i = 0;
11335 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011336 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011337 i++;
11338 }
11339 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011340
Benjamin Peterson14339b62009-01-31 16:36:08 +000011341 j = len;
11342 if (striptype != LEFTSTRIP) {
11343 do {
11344 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011345 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011346 j++;
11347 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011348
Victor Stinner12bab6d2011-10-01 01:53:49 +020011349 return PyUnicode_Substring((PyObject*)self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011350}
11351
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011352
11353static PyObject *
11354do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
11355{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011356 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011357
Benjamin Peterson14339b62009-01-31 16:36:08 +000011358 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11359 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011360
Benjamin Peterson14339b62009-01-31 16:36:08 +000011361 if (sep != NULL && sep != Py_None) {
11362 if (PyUnicode_Check(sep))
11363 return _PyUnicode_XStrip(self, striptype, sep);
11364 else {
11365 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011366 "%s arg must be None or str",
11367 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011368 return NULL;
11369 }
11370 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011371
Benjamin Peterson14339b62009-01-31 16:36:08 +000011372 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011373}
11374
11375
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011376PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011377 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011378\n\
11379Return a copy of the string S with leading and trailing\n\
11380whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011381If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011382
11383static PyObject *
11384unicode_strip(PyUnicodeObject *self, PyObject *args)
11385{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011386 if (PyTuple_GET_SIZE(args) == 0)
11387 return do_strip(self, BOTHSTRIP); /* Common case */
11388 else
11389 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011390}
11391
11392
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011393PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011394 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011395\n\
11396Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011397If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011398
11399static PyObject *
11400unicode_lstrip(PyUnicodeObject *self, PyObject *args)
11401{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011402 if (PyTuple_GET_SIZE(args) == 0)
11403 return do_strip(self, LEFTSTRIP); /* Common case */
11404 else
11405 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011406}
11407
11408
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011409PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011410 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011411\n\
11412Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011413If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011414
11415static PyObject *
11416unicode_rstrip(PyUnicodeObject *self, PyObject *args)
11417{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011418 if (PyTuple_GET_SIZE(args) == 0)
11419 return do_strip(self, RIGHTSTRIP); /* Common case */
11420 else
11421 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011422}
11423
11424
Guido van Rossumd57fd912000-03-10 22:53:23 +000011425static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000011426unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011427{
11428 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011429 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011430
Georg Brandl222de0f2009-04-12 12:01:50 +000011431 if (len < 1) {
11432 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011433 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011434 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011435
Tim Peters7a29bd52001-09-12 03:03:31 +000011436 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011437 /* no repeat, return original string */
11438 Py_INCREF(str);
11439 return (PyObject*) str;
11440 }
Tim Peters8f422462000-09-09 06:13:41 +000011441
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011442 if (PyUnicode_READY(str) == -1)
11443 return NULL;
11444
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011445 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011446 PyErr_SetString(PyExc_OverflowError,
11447 "repeated string is too long");
11448 return NULL;
11449 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011450 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011451
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011452 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011453 if (!u)
11454 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011455 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011456
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011457 if (PyUnicode_GET_LENGTH(str) == 1) {
11458 const int kind = PyUnicode_KIND(str);
11459 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11460 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011461 if (kind == PyUnicode_1BYTE_KIND)
11462 memset(to, (unsigned char)fill_char, len);
11463 else {
11464 for (n = 0; n < len; ++n)
11465 PyUnicode_WRITE(kind, to, n, fill_char);
11466 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011467 }
11468 else {
11469 /* number of characters copied this far */
11470 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011471 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011472 char *to = (char *) PyUnicode_DATA(u);
11473 Py_MEMCPY(to, PyUnicode_DATA(str),
11474 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011475 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011476 n = (done <= nchars-done) ? done : nchars-done;
11477 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011478 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011479 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011480 }
11481
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011482 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011483 return (PyObject*) u;
11484}
11485
Alexander Belopolsky40018472011-02-26 01:02:56 +000011486PyObject *
11487PyUnicode_Replace(PyObject *obj,
11488 PyObject *subobj,
11489 PyObject *replobj,
11490 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011491{
11492 PyObject *self;
11493 PyObject *str1;
11494 PyObject *str2;
11495 PyObject *result;
11496
11497 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011498 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011499 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011500 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011501 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011502 Py_DECREF(self);
11503 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011504 }
11505 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011506 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011507 Py_DECREF(self);
11508 Py_DECREF(str1);
11509 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011510 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011511 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011512 Py_DECREF(self);
11513 Py_DECREF(str1);
11514 Py_DECREF(str2);
11515 return result;
11516}
11517
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011518PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011519 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011520\n\
11521Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011522old replaced by new. If the optional argument count is\n\
11523given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011524
11525static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011526unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011527{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011528 PyObject *str1;
11529 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011530 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011531 PyObject *result;
11532
Martin v. Löwis18e16552006-02-15 17:27:45 +000011533 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011534 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011535 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011536 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011537 str1 = PyUnicode_FromObject(str1);
11538 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11539 return NULL;
11540 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011541 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011542 Py_DECREF(str1);
11543 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011544 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011545
11546 result = replace(self, str1, str2, maxcount);
11547
11548 Py_DECREF(str1);
11549 Py_DECREF(str2);
11550 return result;
11551}
11552
Alexander Belopolsky40018472011-02-26 01:02:56 +000011553static PyObject *
11554unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011555{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011556 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011557 Py_ssize_t isize;
11558 Py_ssize_t osize, squote, dquote, i, o;
11559 Py_UCS4 max, quote;
11560 int ikind, okind;
11561 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011562
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011563 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011564 return NULL;
11565
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011566 isize = PyUnicode_GET_LENGTH(unicode);
11567 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011568
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011569 /* Compute length of output, quote characters, and
11570 maximum character */
11571 osize = 2; /* quotes */
11572 max = 127;
11573 squote = dquote = 0;
11574 ikind = PyUnicode_KIND(unicode);
11575 for (i = 0; i < isize; i++) {
11576 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11577 switch (ch) {
11578 case '\'': squote++; osize++; break;
11579 case '"': dquote++; osize++; break;
11580 case '\\': case '\t': case '\r': case '\n':
11581 osize += 2; break;
11582 default:
11583 /* Fast-path ASCII */
11584 if (ch < ' ' || ch == 0x7f)
11585 osize += 4; /* \xHH */
11586 else if (ch < 0x7f)
11587 osize++;
11588 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11589 osize++;
11590 max = ch > max ? ch : max;
11591 }
11592 else if (ch < 0x100)
11593 osize += 4; /* \xHH */
11594 else if (ch < 0x10000)
11595 osize += 6; /* \uHHHH */
11596 else
11597 osize += 10; /* \uHHHHHHHH */
11598 }
11599 }
11600
11601 quote = '\'';
11602 if (squote) {
11603 if (dquote)
11604 /* Both squote and dquote present. Use squote,
11605 and escape them */
11606 osize += squote;
11607 else
11608 quote = '"';
11609 }
11610
11611 repr = PyUnicode_New(osize, max);
11612 if (repr == NULL)
11613 return NULL;
11614 okind = PyUnicode_KIND(repr);
11615 odata = PyUnicode_DATA(repr);
11616
11617 PyUnicode_WRITE(okind, odata, 0, quote);
11618 PyUnicode_WRITE(okind, odata, osize-1, quote);
11619
11620 for (i = 0, o = 1; i < isize; i++) {
11621 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011622
11623 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011624 if ((ch == quote) || (ch == '\\')) {
11625 PyUnicode_WRITE(okind, odata, o++, '\\');
11626 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011627 continue;
11628 }
11629
Benjamin Peterson29060642009-01-31 22:14:21 +000011630 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011631 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011632 PyUnicode_WRITE(okind, odata, o++, '\\');
11633 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011634 }
11635 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011636 PyUnicode_WRITE(okind, odata, o++, '\\');
11637 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011638 }
11639 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011640 PyUnicode_WRITE(okind, odata, o++, '\\');
11641 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011642 }
11643
11644 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011645 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011646 PyUnicode_WRITE(okind, odata, o++, '\\');
11647 PyUnicode_WRITE(okind, odata, o++, 'x');
11648 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11649 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011650 }
11651
Georg Brandl559e5d72008-06-11 18:37:52 +000011652 /* Copy ASCII characters as-is */
11653 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011654 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011655 }
11656
Benjamin Peterson29060642009-01-31 22:14:21 +000011657 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011658 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011659 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011660 (categories Z* and C* except ASCII space)
11661 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011662 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011663 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011664 if (ch <= 0xff) {
11665 PyUnicode_WRITE(okind, odata, o++, '\\');
11666 PyUnicode_WRITE(okind, odata, o++, 'x');
11667 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11668 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011669 }
11670 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011671 else if (ch >= 0x10000) {
11672 PyUnicode_WRITE(okind, odata, o++, '\\');
11673 PyUnicode_WRITE(okind, odata, o++, 'U');
11674 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
11675 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
11676 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
11677 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
11678 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11679 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11680 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11681 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011682 }
11683 /* Map 16-bit characters to '\uxxxx' */
11684 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011685 PyUnicode_WRITE(okind, odata, o++, '\\');
11686 PyUnicode_WRITE(okind, odata, o++, 'u');
11687 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11688 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11689 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11690 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011691 }
11692 }
11693 /* Copy characters as-is */
11694 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011695 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011696 }
11697 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000011698 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011699 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020011700 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000011701 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011702}
11703
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011704PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011705 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011706\n\
11707Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011708such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011709arguments start and end are interpreted as in slice notation.\n\
11710\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011711Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011712
11713static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011714unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011715{
Jesus Ceaac451502011-04-20 17:09:23 +020011716 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011717 Py_ssize_t start;
11718 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011719 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011720
Jesus Ceaac451502011-04-20 17:09:23 +020011721 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11722 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011723 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011724
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011725 if (PyUnicode_READY(self) == -1)
11726 return NULL;
11727 if (PyUnicode_READY(substring) == -1)
11728 return NULL;
11729
Victor Stinner794d5672011-10-10 03:21:36 +020011730 result = any_find_slice(-1,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011731 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011732 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011733
11734 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011735
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011736 if (result == -2)
11737 return NULL;
11738
Christian Heimes217cfd12007-12-02 14:31:20 +000011739 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011740}
11741
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011742PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011743 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011744\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011745Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011746
11747static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011748unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011749{
Jesus Ceaac451502011-04-20 17:09:23 +020011750 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011751 Py_ssize_t start;
11752 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011753 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011754
Jesus Ceaac451502011-04-20 17:09:23 +020011755 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
11756 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011757 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011758
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011759 if (PyUnicode_READY(self) == -1)
11760 return NULL;
11761 if (PyUnicode_READY(substring) == -1)
11762 return NULL;
11763
Victor Stinner794d5672011-10-10 03:21:36 +020011764 result = any_find_slice(-1,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011765 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011766 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011767
11768 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011769
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011770 if (result == -2)
11771 return NULL;
11772
Guido van Rossumd57fd912000-03-10 22:53:23 +000011773 if (result < 0) {
11774 PyErr_SetString(PyExc_ValueError, "substring not found");
11775 return NULL;
11776 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011777
Christian Heimes217cfd12007-12-02 14:31:20 +000011778 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011779}
11780
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011781PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011782 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011783\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011784Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011785done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011786
11787static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011788unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011789{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011790 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011791 Py_UCS4 fillchar = ' ';
11792
Victor Stinnere9a29352011-10-01 02:14:59 +020011793 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011794 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011795
Victor Stinnere9a29352011-10-01 02:14:59 +020011796 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011797 return NULL;
11798
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011799 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011800 Py_INCREF(self);
11801 return (PyObject*) self;
11802 }
11803
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011804 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011805}
11806
Alexander Belopolsky40018472011-02-26 01:02:56 +000011807PyObject *
11808PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011809{
11810 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000011811
Guido van Rossumd57fd912000-03-10 22:53:23 +000011812 s = PyUnicode_FromObject(s);
11813 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011814 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011815 if (sep != NULL) {
11816 sep = PyUnicode_FromObject(sep);
11817 if (sep == NULL) {
11818 Py_DECREF(s);
11819 return NULL;
11820 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011821 }
11822
Victor Stinner9310abb2011-10-05 00:59:23 +020011823 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011824
11825 Py_DECREF(s);
11826 Py_XDECREF(sep);
11827 return result;
11828}
11829
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011830PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011831 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011832\n\
11833Return a list of the words in S, using sep as the\n\
11834delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000011835splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000011836whitespace string is a separator and empty strings are\n\
11837removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011838
11839static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011840unicode_split(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011841{
11842 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011843 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011844
Martin v. Löwis18e16552006-02-15 17:27:45 +000011845 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011846 return NULL;
11847
11848 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011849 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011850 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020011851 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011852 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011853 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011854}
11855
Thomas Wouters477c8d52006-05-27 19:21:47 +000011856PyObject *
11857PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11858{
11859 PyObject* str_obj;
11860 PyObject* sep_obj;
11861 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011862 int kind1, kind2, kind;
11863 void *buf1 = NULL, *buf2 = NULL;
11864 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011865
11866 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020011867 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011868 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011869 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011870 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011871 Py_DECREF(str_obj);
11872 return NULL;
11873 }
11874
Victor Stinner14f8f022011-10-05 20:58:25 +020011875 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011876 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020011877 kind = Py_MAX(kind1, kind2);
11878 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011879 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020011880 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011881 if (!buf1)
11882 goto onError;
11883 buf2 = PyUnicode_DATA(sep_obj);
11884 if (kind2 != kind)
11885 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11886 if (!buf2)
11887 goto onError;
11888 len1 = PyUnicode_GET_LENGTH(str_obj);
11889 len2 = PyUnicode_GET_LENGTH(sep_obj);
11890
Victor Stinner14f8f022011-10-05 20:58:25 +020011891 switch(PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011892 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020011893 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
11894 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11895 else
11896 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011897 break;
11898 case PyUnicode_2BYTE_KIND:
11899 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11900 break;
11901 case PyUnicode_4BYTE_KIND:
11902 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11903 break;
11904 default:
11905 assert(0);
11906 out = 0;
11907 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011908
11909 Py_DECREF(sep_obj);
11910 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011911 if (kind1 != kind)
11912 PyMem_Free(buf1);
11913 if (kind2 != kind)
11914 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011915
11916 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011917 onError:
11918 Py_DECREF(sep_obj);
11919 Py_DECREF(str_obj);
11920 if (kind1 != kind && buf1)
11921 PyMem_Free(buf1);
11922 if (kind2 != kind && buf2)
11923 PyMem_Free(buf2);
11924 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011925}
11926
11927
11928PyObject *
11929PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11930{
11931 PyObject* str_obj;
11932 PyObject* sep_obj;
11933 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011934 int kind1, kind2, kind;
11935 void *buf1 = NULL, *buf2 = NULL;
11936 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011937
11938 str_obj = PyUnicode_FromObject(str_in);
11939 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011940 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011941 sep_obj = PyUnicode_FromObject(sep_in);
11942 if (!sep_obj) {
11943 Py_DECREF(str_obj);
11944 return NULL;
11945 }
11946
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011947 kind1 = PyUnicode_KIND(str_in);
11948 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011949 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011950 buf1 = PyUnicode_DATA(str_in);
11951 if (kind1 != kind)
11952 buf1 = _PyUnicode_AsKind(str_in, kind);
11953 if (!buf1)
11954 goto onError;
11955 buf2 = PyUnicode_DATA(sep_obj);
11956 if (kind2 != kind)
11957 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11958 if (!buf2)
11959 goto onError;
11960 len1 = PyUnicode_GET_LENGTH(str_obj);
11961 len2 = PyUnicode_GET_LENGTH(sep_obj);
11962
11963 switch(PyUnicode_KIND(str_in)) {
11964 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020011965 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
11966 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11967 else
11968 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011969 break;
11970 case PyUnicode_2BYTE_KIND:
11971 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11972 break;
11973 case PyUnicode_4BYTE_KIND:
11974 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11975 break;
11976 default:
11977 assert(0);
11978 out = 0;
11979 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011980
11981 Py_DECREF(sep_obj);
11982 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011983 if (kind1 != kind)
11984 PyMem_Free(buf1);
11985 if (kind2 != kind)
11986 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011987
11988 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011989 onError:
11990 Py_DECREF(sep_obj);
11991 Py_DECREF(str_obj);
11992 if (kind1 != kind && buf1)
11993 PyMem_Free(buf1);
11994 if (kind2 != kind && buf2)
11995 PyMem_Free(buf2);
11996 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011997}
11998
11999PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012000 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012001\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012002Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012003the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012004found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012005
12006static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012007unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012008{
Victor Stinner9310abb2011-10-05 00:59:23 +020012009 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012010}
12011
12012PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012013 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012014\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012015Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012016the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012017separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012018
12019static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012020unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012021{
Victor Stinner9310abb2011-10-05 00:59:23 +020012022 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012023}
12024
Alexander Belopolsky40018472011-02-26 01:02:56 +000012025PyObject *
12026PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012027{
12028 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012029
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012030 s = PyUnicode_FromObject(s);
12031 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012032 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012033 if (sep != NULL) {
12034 sep = PyUnicode_FromObject(sep);
12035 if (sep == NULL) {
12036 Py_DECREF(s);
12037 return NULL;
12038 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012039 }
12040
Victor Stinner9310abb2011-10-05 00:59:23 +020012041 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012042
12043 Py_DECREF(s);
12044 Py_XDECREF(sep);
12045 return result;
12046}
12047
12048PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012049 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012050\n\
12051Return a list of the words in S, using sep as the\n\
12052delimiter string, starting at the end of the string and\n\
12053working to the front. If maxsplit is given, at most maxsplit\n\
12054splits are done. If sep is not specified, any whitespace string\n\
12055is a separator.");
12056
12057static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012058unicode_rsplit(PyObject *self, PyObject *args)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012059{
12060 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012061 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012062
Martin v. Löwis18e16552006-02-15 17:27:45 +000012063 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012064 return NULL;
12065
12066 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012067 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012068 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012069 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012070 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012071 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012072}
12073
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012074PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012075 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012076\n\
12077Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012078Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012079is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012080
12081static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012082unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012083{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012084 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012085 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012086
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012087 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12088 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012089 return NULL;
12090
Guido van Rossum86662912000-04-11 15:38:46 +000012091 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012092}
12093
12094static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012095PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012096{
Walter Dörwald346737f2007-05-31 10:44:43 +000012097 if (PyUnicode_CheckExact(self)) {
12098 Py_INCREF(self);
12099 return self;
12100 } else
12101 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020012102 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012103}
12104
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012105PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012106 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012107\n\
12108Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012109and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012110
12111static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012112unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012113{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012114 return fixup(self, fixswapcase);
12115}
12116
Georg Brandlceee0772007-11-27 23:48:05 +000012117PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012118 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012119\n\
12120Return a translation table usable for str.translate().\n\
12121If there is only one argument, it must be a dictionary mapping Unicode\n\
12122ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012123Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012124If there are two arguments, they must be strings of equal length, and\n\
12125in the resulting dictionary, each character in x will be mapped to the\n\
12126character at the same position in y. If there is a third argument, it\n\
12127must be a string, whose characters will be mapped to None in the result.");
12128
12129static PyObject*
12130unicode_maketrans(PyUnicodeObject *null, PyObject *args)
12131{
12132 PyObject *x, *y = NULL, *z = NULL;
12133 PyObject *new = NULL, *key, *value;
12134 Py_ssize_t i = 0;
12135 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012136
Georg Brandlceee0772007-11-27 23:48:05 +000012137 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12138 return NULL;
12139 new = PyDict_New();
12140 if (!new)
12141 return NULL;
12142 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012143 int x_kind, y_kind, z_kind;
12144 void *x_data, *y_data, *z_data;
12145
Georg Brandlceee0772007-11-27 23:48:05 +000012146 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012147 if (!PyUnicode_Check(x)) {
12148 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12149 "be a string if there is a second argument");
12150 goto err;
12151 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012152 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012153 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12154 "arguments must have equal length");
12155 goto err;
12156 }
12157 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012158 x_kind = PyUnicode_KIND(x);
12159 y_kind = PyUnicode_KIND(y);
12160 x_data = PyUnicode_DATA(x);
12161 y_data = PyUnicode_DATA(y);
12162 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12163 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12164 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012165 if (!key || !value)
12166 goto err;
12167 res = PyDict_SetItem(new, key, value);
12168 Py_DECREF(key);
12169 Py_DECREF(value);
12170 if (res < 0)
12171 goto err;
12172 }
12173 /* create entries for deleting chars in z */
12174 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012175 z_kind = PyUnicode_KIND(z);
12176 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012177 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012178 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012179 if (!key)
12180 goto err;
12181 res = PyDict_SetItem(new, key, Py_None);
12182 Py_DECREF(key);
12183 if (res < 0)
12184 goto err;
12185 }
12186 }
12187 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012188 int kind;
12189 void *data;
12190
Georg Brandlceee0772007-11-27 23:48:05 +000012191 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012192 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012193 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12194 "to maketrans it must be a dict");
12195 goto err;
12196 }
12197 /* copy entries into the new dict, converting string keys to int keys */
12198 while (PyDict_Next(x, &i, &key, &value)) {
12199 if (PyUnicode_Check(key)) {
12200 /* convert string keys to integer keys */
12201 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012202 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012203 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12204 "table must be of length 1");
12205 goto err;
12206 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012207 kind = PyUnicode_KIND(key);
12208 data = PyUnicode_DATA(key);
12209 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012210 if (!newkey)
12211 goto err;
12212 res = PyDict_SetItem(new, newkey, value);
12213 Py_DECREF(newkey);
12214 if (res < 0)
12215 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012216 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012217 /* just keep integer keys */
12218 if (PyDict_SetItem(new, key, value) < 0)
12219 goto err;
12220 } else {
12221 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12222 "be strings or integers");
12223 goto err;
12224 }
12225 }
12226 }
12227 return new;
12228 err:
12229 Py_DECREF(new);
12230 return NULL;
12231}
12232
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012233PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012234 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012235\n\
12236Return a copy of the string S, where all characters have been mapped\n\
12237through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012238Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012239Unmapped characters are left untouched. Characters mapped to None\n\
12240are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012241
12242static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012243unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012244{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012245 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012246}
12247
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012248PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012249 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012250\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012251Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012252
12253static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012254unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012255{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012256 return fixup(self, fixupper);
12257}
12258
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012259PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012260 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012261\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012262Pad a numeric string S with zeros on the left, to fill a field\n\
12263of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012264
12265static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012266unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012267{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012268 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012269 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012270 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012271 int kind;
12272 void *data;
12273 Py_UCS4 chr;
12274
12275 if (PyUnicode_READY(self) == -1)
12276 return NULL;
12277
Martin v. Löwis18e16552006-02-15 17:27:45 +000012278 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012279 return NULL;
12280
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012281 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012282 if (PyUnicode_CheckExact(self)) {
12283 Py_INCREF(self);
12284 return (PyObject*) self;
12285 }
12286 else
Victor Stinner2219e0a2011-10-01 01:16:59 +020012287 return PyUnicode_Copy((PyObject*)self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012288 }
12289
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012290 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012291
12292 u = pad(self, fill, 0, '0');
12293
Walter Dörwald068325e2002-04-15 13:36:47 +000012294 if (u == NULL)
12295 return NULL;
12296
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012297 kind = PyUnicode_KIND(u);
12298 data = PyUnicode_DATA(u);
12299 chr = PyUnicode_READ(kind, data, fill);
12300
12301 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012302 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012303 PyUnicode_WRITE(kind, data, 0, chr);
12304 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012305 }
12306
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012307 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012308 return (PyObject*) u;
12309}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012310
#if 0
/* Compiled out.  Thin wrapper returning the result of
   PyUnicode_TransformDecimalAndSpaceToASCII() applied to self. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *ascii = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return ascii;
}
#endif
12318
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012319PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012320 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012321\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012322Return True if S starts with the specified prefix, False otherwise.\n\
12323With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012324With optional end, stop comparing S at that position.\n\
12325prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012326
12327static PyObject *
12328unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012329 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012330{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012331 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012332 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012333 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012334 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012335 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012336
Jesus Ceaac451502011-04-20 17:09:23 +020012337 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012338 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012339 if (PyTuple_Check(subobj)) {
12340 Py_ssize_t i;
12341 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12342 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012343 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012344 if (substring == NULL)
12345 return NULL;
12346 result = tailmatch(self, substring, start, end, -1);
12347 Py_DECREF(substring);
12348 if (result) {
12349 Py_RETURN_TRUE;
12350 }
12351 }
12352 /* nothing matched */
12353 Py_RETURN_FALSE;
12354 }
12355 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012356 if (substring == NULL) {
12357 if (PyErr_ExceptionMatches(PyExc_TypeError))
12358 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12359 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012360 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012361 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012362 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012363 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012364 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012365}
12366
12367
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012368PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012369 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012370\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012371Return True if S ends with the specified suffix, False otherwise.\n\
12372With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012373With optional end, stop comparing S at that position.\n\
12374suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012375
12376static PyObject *
12377unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012378 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012379{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012380 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012381 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012382 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012383 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012384 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012385
Jesus Ceaac451502011-04-20 17:09:23 +020012386 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012387 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012388 if (PyTuple_Check(subobj)) {
12389 Py_ssize_t i;
12390 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12391 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012392 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012393 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012394 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012395 result = tailmatch(self, substring, start, end, +1);
12396 Py_DECREF(substring);
12397 if (result) {
12398 Py_RETURN_TRUE;
12399 }
12400 }
12401 Py_RETURN_FALSE;
12402 }
12403 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012404 if (substring == NULL) {
12405 if (PyErr_ExceptionMatches(PyExc_TypeError))
12406 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12407 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012408 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012409 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012410 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012411 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012412 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012413}
12414
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012415#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012416
12417PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012418 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012419\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012420Return a formatted version of S, using substitutions from args and kwargs.\n\
12421The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012422
Eric Smith27bbca62010-11-04 17:06:58 +000012423PyDoc_STRVAR(format_map__doc__,
12424 "S.format_map(mapping) -> str\n\
12425\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012426Return a formatted version of S, using substitutions from mapping.\n\
12427The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012428
Eric Smith4a7d76d2008-05-30 18:10:19 +000012429static PyObject *
12430unicode__format__(PyObject* self, PyObject* args)
12431{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012432 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012433
12434 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12435 return NULL;
12436
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012437 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012438 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012439 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012440}
12441
Eric Smith8c663262007-08-25 02:26:07 +000012442PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012443 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012444\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012445Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012446
12447static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012448unicode__sizeof__(PyUnicodeObject *v)
12449{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012450 Py_ssize_t size;
12451
12452 /* If it's a compact object, account for base structure +
12453 character data. */
12454 if (PyUnicode_IS_COMPACT_ASCII(v))
12455 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12456 else if (PyUnicode_IS_COMPACT(v))
12457 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012458 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012459 else {
12460 /* If it is a two-block object, account for base object, and
12461 for character block if present. */
12462 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012463 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012464 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012465 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012466 }
12467 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012468 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012469 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012470 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012471 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012472 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012473
12474 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012475}
12476
12477PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012478 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012479
12480static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012481unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012482{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012483 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012484 if (!copy)
12485 return NULL;
12486 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012487}
12488
Guido van Rossumd57fd912000-03-10 22:53:23 +000012489static PyMethodDef unicode_methods[] = {
12490
12491 /* Order is according to common usage: often used methods should
12492 appear first, since lookup is done sequentially. */
12493
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012494 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012495 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12496 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012497 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012498 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12499 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12500 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12501 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12502 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12503 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12504 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012505 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012506 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12507 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12508 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012509 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012510 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12511 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12512 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012513 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012514 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012515 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012516 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012517 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12518 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12519 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12520 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12521 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12522 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12523 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12524 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12525 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12526 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12527 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12528 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12529 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12530 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012531 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012532 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012533 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012534 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012535 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012536 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012537 {"maketrans", (PyCFunction) unicode_maketrans,
12538 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012539 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012540#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012541 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012542#endif
12543
12544#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012545 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012546 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012547#endif
12548
Benjamin Peterson14339b62009-01-31 16:36:08 +000012549 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012550 {NULL, NULL}
12551};
12552
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012553static PyObject *
12554unicode_mod(PyObject *v, PyObject *w)
12555{
Brian Curtindfc80e32011-08-10 20:28:54 -050012556 if (!PyUnicode_Check(v))
12557 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012558 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012559}
12560
12561static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012562 0, /*nb_add*/
12563 0, /*nb_subtract*/
12564 0, /*nb_multiply*/
12565 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012566};
12567
Guido van Rossumd57fd912000-03-10 22:53:23 +000012568static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012569 (lenfunc) unicode_length, /* sq_length */
12570 PyUnicode_Concat, /* sq_concat */
12571 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12572 (ssizeargfunc) unicode_getitem, /* sq_item */
12573 0, /* sq_slice */
12574 0, /* sq_ass_item */
12575 0, /* sq_ass_slice */
12576 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012577};
12578
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012579static PyObject*
12580unicode_subscript(PyUnicodeObject* self, PyObject* item)
12581{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012582 if (PyUnicode_READY(self) == -1)
12583 return NULL;
12584
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000012585 if (PyIndex_Check(item)) {
12586 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012587 if (i == -1 && PyErr_Occurred())
12588 return NULL;
12589 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012590 i += PyUnicode_GET_LENGTH(self);
Victor Stinner2fe5ced2011-10-02 00:25:40 +020012591 return unicode_getitem((PyObject*)self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012592 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000012593 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012594 PyObject *result;
12595 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012596 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012597 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012598
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012599 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000012600 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012601 return NULL;
12602 }
12603
12604 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012605 return PyUnicode_New(0, 0);
12606 } else if (start == 0 && step == 1 &&
12607 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000012608 PyUnicode_CheckExact(self)) {
12609 Py_INCREF(self);
12610 return (PyObject *)self;
12611 } else if (step == 1) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012612 return PyUnicode_Substring((PyObject*)self,
12613 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012614 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012615 /* General case */
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012616 max_char = 0;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012617 src_kind = PyUnicode_KIND(self);
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012618 kind_limit = kind_maxchar_limit(src_kind);
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012619 src_data = PyUnicode_DATA(self);
12620 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12621 ch = PyUnicode_READ(src_kind, src_data, cur);
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012622 if (ch > max_char) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012623 max_char = ch;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012624 if (max_char >= kind_limit)
12625 break;
12626 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012627 }
12628 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012629 if (result == NULL)
12630 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012631 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012632 dest_data = PyUnicode_DATA(result);
12633
12634 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012635 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
12636 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012637 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012638 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012639 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012640 } else {
12641 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12642 return NULL;
12643 }
12644}
12645
12646static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012647 (lenfunc)unicode_length, /* mp_length */
12648 (binaryfunc)unicode_subscript, /* mp_subscript */
12649 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012650};
12651
Guido van Rossumd57fd912000-03-10 22:53:23 +000012652
Guido van Rossumd57fd912000-03-10 22:53:23 +000012653/* Helpers for PyUnicode_Format() */
12654
12655static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000012656getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012657{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012658 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012659 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012660 (*p_argidx)++;
12661 if (arglen < 0)
12662 return args;
12663 else
12664 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012665 }
12666 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012667 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012668 return NULL;
12669}
12670
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012671/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012672
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012673static PyObject *
12674formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012675{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012676 char *p;
12677 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012678 double x;
Tim Petersced69f82003-09-16 20:30:58 +000012679
Guido van Rossumd57fd912000-03-10 22:53:23 +000012680 x = PyFloat_AsDouble(v);
12681 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012682 return NULL;
12683
Guido van Rossumd57fd912000-03-10 22:53:23 +000012684 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012685 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000012686
Eric Smith0923d1d2009-04-16 20:16:10 +000012687 p = PyOS_double_to_string(x, type, prec,
12688 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012689 if (p == NULL)
12690 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012691 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000012692 PyMem_Free(p);
12693 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012694}
12695
Tim Peters38fd5b62000-09-21 05:43:11 +000012696static PyObject*
12697formatlong(PyObject *val, int flags, int prec, int type)
12698{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012699 char *buf;
12700 int len;
12701 PyObject *str; /* temporary string object. */
12702 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012703
Benjamin Peterson14339b62009-01-31 16:36:08 +000012704 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
12705 if (!str)
12706 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012707 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012708 Py_DECREF(str);
12709 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012710}
12711
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012712static Py_UCS4
12713formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012714{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012715 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012716 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012717 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012718 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000012719 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012720 goto onError;
12721 }
12722 else {
12723 /* Integer input truncated to a character */
12724 long x;
12725 x = PyLong_AsLong(v);
12726 if (x == -1 && PyErr_Occurred())
12727 goto onError;
12728
12729 if (x < 0 || x > 0x10ffff) {
12730 PyErr_SetString(PyExc_OverflowError,
12731 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012732 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012733 }
12734
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012735 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012736 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012737
Benjamin Peterson29060642009-01-31 22:14:21 +000012738 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012739 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012740 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012741 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012742}
12743
Antoine Pitrou978b9d22011-10-07 12:35:48 +020012744static int
12745repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
12746{
12747 int r;
12748 assert(count > 0);
12749 assert(PyUnicode_Check(obj));
12750 if (count > 5) {
12751 PyObject *repeated = unicode_repeat((PyUnicodeObject *) obj, count);
12752 if (repeated == NULL)
12753 return -1;
12754 r = _PyAccu_Accumulate(acc, repeated);
12755 Py_DECREF(repeated);
12756 return r;
12757 }
12758 else {
12759 do {
12760 if (_PyAccu_Accumulate(acc, obj))
12761 return -1;
12762 } while (--count);
12763 return 0;
12764 }
12765}
12766
Alexander Belopolsky40018472011-02-26 01:02:56 +000012767PyObject *
12768PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012769{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012770 void *fmt;
12771 int fmtkind;
12772 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012773 int kind;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012774 int r;
12775 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012776 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012777 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012778 PyObject *temp = NULL;
12779 PyObject *second = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012780 PyUnicodeObject *uformat;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012781 _PyAccu acc;
12782 static PyObject *plus, *minus, *blank, *zero, *percent;
12783
12784 if (!plus && !(plus = get_latin1_char('+')))
12785 return NULL;
12786 if (!minus && !(minus = get_latin1_char('-')))
12787 return NULL;
12788 if (!blank && !(blank = get_latin1_char(' ')))
12789 return NULL;
12790 if (!zero && !(zero = get_latin1_char('0')))
12791 return NULL;
12792 if (!percent && !(percent = get_latin1_char('%')))
12793 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000012794
Guido van Rossumd57fd912000-03-10 22:53:23 +000012795 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012796 PyErr_BadInternalCall();
12797 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012798 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012799 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
12800 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012801 return NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012802 if (_PyAccu_Init(&acc))
12803 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012804 fmt = PyUnicode_DATA(uformat);
12805 fmtkind = PyUnicode_KIND(uformat);
12806 fmtcnt = PyUnicode_GET_LENGTH(uformat);
12807 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012808
Guido van Rossumd57fd912000-03-10 22:53:23 +000012809 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012810 arglen = PyTuple_Size(args);
12811 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012812 }
12813 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012814 arglen = -1;
12815 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012816 }
Christian Heimes90aa7642007-12-19 02:45:37 +000012817 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000012818 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000012819 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012820
12821 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012822 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012823 PyObject *nonfmt;
12824 Py_ssize_t nonfmtpos;
12825 nonfmtpos = fmtpos++;
12826 while (fmtcnt >= 0 &&
12827 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
12828 fmtpos++;
12829 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012830 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012831 nonfmt = PyUnicode_Substring((PyObject *) uformat, nonfmtpos, fmtpos);
12832 if (nonfmt == NULL)
12833 goto onError;
12834 r = _PyAccu_Accumulate(&acc, nonfmt);
12835 Py_DECREF(nonfmt);
12836 if (r)
12837 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012838 }
12839 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012840 /* Got a format specifier */
12841 int flags = 0;
12842 Py_ssize_t width = -1;
12843 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012844 Py_UCS4 c = '\0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012845 Py_UCS4 fill, sign;
Benjamin Peterson29060642009-01-31 22:14:21 +000012846 int isnumok;
12847 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012848 void *pbuf = NULL;
12849 Py_ssize_t pindex, len;
12850 PyObject *signobj = NULL, *fillobj = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012851
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012852 fmtpos++;
12853 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
12854 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000012855 Py_ssize_t keylen;
12856 PyObject *key;
12857 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000012858
Benjamin Peterson29060642009-01-31 22:14:21 +000012859 if (dict == NULL) {
12860 PyErr_SetString(PyExc_TypeError,
12861 "format requires a mapping");
12862 goto onError;
12863 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012864 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012865 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012866 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012867 /* Skip over balanced parentheses */
12868 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012869 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012870 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012871 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012872 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012873 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012874 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012875 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012876 if (fmtcnt < 0 || pcount > 0) {
12877 PyErr_SetString(PyExc_ValueError,
12878 "incomplete format key");
12879 goto onError;
12880 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020012881 key = PyUnicode_Substring((PyObject*)uformat,
12882 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012883 if (key == NULL)
12884 goto onError;
12885 if (args_owned) {
12886 Py_DECREF(args);
12887 args_owned = 0;
12888 }
12889 args = PyObject_GetItem(dict, key);
12890 Py_DECREF(key);
12891 if (args == NULL) {
12892 goto onError;
12893 }
12894 args_owned = 1;
12895 arglen = -1;
12896 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012897 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012898 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012899 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012900 case '-': flags |= F_LJUST; continue;
12901 case '+': flags |= F_SIGN; continue;
12902 case ' ': flags |= F_BLANK; continue;
12903 case '#': flags |= F_ALT; continue;
12904 case '0': flags |= F_ZERO; continue;
12905 }
12906 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012907 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012908 if (c == '*') {
12909 v = getnextarg(args, arglen, &argidx);
12910 if (v == NULL)
12911 goto onError;
12912 if (!PyLong_Check(v)) {
12913 PyErr_SetString(PyExc_TypeError,
12914 "* wants int");
12915 goto onError;
12916 }
12917 width = PyLong_AsLong(v);
12918 if (width == -1 && PyErr_Occurred())
12919 goto onError;
12920 if (width < 0) {
12921 flags |= F_LJUST;
12922 width = -width;
12923 }
12924 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012925 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012926 }
12927 else if (c >= '0' && c <= '9') {
12928 width = c - '0';
12929 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012930 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012931 if (c < '0' || c > '9')
12932 break;
12933 if ((width*10) / 10 != width) {
12934 PyErr_SetString(PyExc_ValueError,
12935 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012936 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012937 }
12938 width = width*10 + (c - '0');
12939 }
12940 }
12941 if (c == '.') {
12942 prec = 0;
12943 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012944 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012945 if (c == '*') {
12946 v = getnextarg(args, arglen, &argidx);
12947 if (v == NULL)
12948 goto onError;
12949 if (!PyLong_Check(v)) {
12950 PyErr_SetString(PyExc_TypeError,
12951 "* wants int");
12952 goto onError;
12953 }
12954 prec = PyLong_AsLong(v);
12955 if (prec == -1 && PyErr_Occurred())
12956 goto onError;
12957 if (prec < 0)
12958 prec = 0;
12959 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012960 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012961 }
12962 else if (c >= '0' && c <= '9') {
12963 prec = c - '0';
12964 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012965 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012966 if (c < '0' || c > '9')
12967 break;
12968 if ((prec*10) / 10 != prec) {
12969 PyErr_SetString(PyExc_ValueError,
12970 "prec too big");
12971 goto onError;
12972 }
12973 prec = prec*10 + (c - '0');
12974 }
12975 }
12976 } /* prec */
12977 if (fmtcnt >= 0) {
12978 if (c == 'h' || c == 'l' || c == 'L') {
12979 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012980 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012981 }
12982 }
12983 if (fmtcnt < 0) {
12984 PyErr_SetString(PyExc_ValueError,
12985 "incomplete format");
12986 goto onError;
12987 }
12988 if (c != '%') {
12989 v = getnextarg(args, arglen, &argidx);
12990 if (v == NULL)
12991 goto onError;
12992 }
12993 sign = 0;
12994 fill = ' ';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012995 fillobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000012996 switch (c) {
12997
12998 case '%':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012999 _PyAccu_Accumulate(&acc, percent);
13000 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013001
13002 case 's':
13003 case 'r':
13004 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000013005 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013006 temp = v;
13007 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013008 }
13009 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013010 if (c == 's')
13011 temp = PyObject_Str(v);
13012 else if (c == 'r')
13013 temp = PyObject_Repr(v);
13014 else
13015 temp = PyObject_ASCII(v);
13016 if (temp == NULL)
13017 goto onError;
13018 if (PyUnicode_Check(temp))
13019 /* nothing to do */;
13020 else {
13021 Py_DECREF(temp);
13022 PyErr_SetString(PyExc_TypeError,
13023 "%s argument has non-string str()");
13024 goto onError;
13025 }
13026 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013027 if (PyUnicode_READY(temp) == -1) {
13028 Py_CLEAR(temp);
13029 goto onError;
13030 }
13031 pbuf = PyUnicode_DATA(temp);
13032 kind = PyUnicode_KIND(temp);
13033 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013034 if (prec >= 0 && len > prec)
13035 len = prec;
13036 break;
13037
13038 case 'i':
13039 case 'd':
13040 case 'u':
13041 case 'o':
13042 case 'x':
13043 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000013044 isnumok = 0;
13045 if (PyNumber_Check(v)) {
13046 PyObject *iobj=NULL;
13047
13048 if (PyLong_Check(v)) {
13049 iobj = v;
13050 Py_INCREF(iobj);
13051 }
13052 else {
13053 iobj = PyNumber_Long(v);
13054 }
13055 if (iobj!=NULL) {
13056 if (PyLong_Check(iobj)) {
13057 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013058 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013059 Py_DECREF(iobj);
13060 if (!temp)
13061 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013062 if (PyUnicode_READY(temp) == -1) {
13063 Py_CLEAR(temp);
13064 goto onError;
13065 }
13066 pbuf = PyUnicode_DATA(temp);
13067 kind = PyUnicode_KIND(temp);
13068 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013069 sign = 1;
13070 }
13071 else {
13072 Py_DECREF(iobj);
13073 }
13074 }
13075 }
13076 if (!isnumok) {
13077 PyErr_Format(PyExc_TypeError,
13078 "%%%c format: a number is required, "
13079 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13080 goto onError;
13081 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013082 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013083 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013084 fillobj = zero;
13085 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013086 break;
13087
13088 case 'e':
13089 case 'E':
13090 case 'f':
13091 case 'F':
13092 case 'g':
13093 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013094 temp = formatfloat(v, flags, prec, c);
13095 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000013096 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013097 if (PyUnicode_READY(temp) == -1) {
13098 Py_CLEAR(temp);
13099 goto onError;
13100 }
13101 pbuf = PyUnicode_DATA(temp);
13102 kind = PyUnicode_KIND(temp);
13103 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013104 sign = 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013105 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013106 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013107 fillobj = zero;
13108 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013109 break;
13110
13111 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013112 {
13113 Py_UCS4 ch = formatchar(v);
13114 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013115 goto onError;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013116 temp = _PyUnicode_FromUCS4(&ch, 1);
13117 if (temp == NULL)
13118 goto onError;
13119 pbuf = PyUnicode_DATA(temp);
13120 kind = PyUnicode_KIND(temp);
13121 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013122 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013123 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013124
13125 default:
13126 PyErr_Format(PyExc_ValueError,
13127 "unsupported format character '%c' (0x%x) "
13128 "at index %zd",
13129 (31<=c && c<=126) ? (char)c : '?',
13130 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013131 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013132 goto onError;
13133 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013134 /* pbuf is initialized here. */
13135 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013136 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013137 if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
13138 signobj = minus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013139 len--;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013140 pindex++;
13141 }
13142 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
13143 signobj = plus;
13144 len--;
13145 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013146 }
13147 else if (flags & F_SIGN)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013148 signobj = plus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013149 else if (flags & F_BLANK)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013150 signobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013151 else
13152 sign = 0;
13153 }
13154 if (width < len)
13155 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013156 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013157 if (fill != ' ') {
13158 assert(signobj != NULL);
13159 if (_PyAccu_Accumulate(&acc, signobj))
13160 goto onError;
13161 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013162 if (width > len)
13163 width--;
13164 }
13165 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013166 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013167 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013168 if (fill != ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013169 second = get_latin1_char(
13170 PyUnicode_READ(kind, pbuf, pindex + 1));
13171 pindex += 2;
13172 if (second == NULL ||
13173 _PyAccu_Accumulate(&acc, zero) ||
13174 _PyAccu_Accumulate(&acc, second))
13175 goto onError;
13176 Py_CLEAR(second);
Benjamin Peterson29060642009-01-31 22:14:21 +000013177 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013178 width -= 2;
13179 if (width < 0)
13180 width = 0;
13181 len -= 2;
13182 }
13183 if (width > len && !(flags & F_LJUST)) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013184 assert(fillobj != NULL);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013185 if (repeat_accumulate(&acc, fillobj, width - len))
13186 goto onError;
13187 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013188 }
13189 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013190 if (sign) {
13191 assert(signobj != NULL);
13192 if (_PyAccu_Accumulate(&acc, signobj))
13193 goto onError;
13194 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013195 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013196 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13197 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013198 second = get_latin1_char(
13199 PyUnicode_READ(kind, pbuf, pindex + 1));
13200 pindex += 2;
13201 if (second == NULL ||
13202 _PyAccu_Accumulate(&acc, zero) ||
13203 _PyAccu_Accumulate(&acc, second))
13204 goto onError;
13205 Py_CLEAR(second);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013206 }
13207 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013208 /* Copy all characters, preserving len */
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013209 if (temp != NULL) {
13210 assert(pbuf == PyUnicode_DATA(temp));
13211 v = PyUnicode_Substring(temp, pindex, pindex + len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013212 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013213 else {
13214 const char *p = (const char *) pbuf;
13215 assert(pbuf != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013216 p += kind * pindex;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013217 v = PyUnicode_FromKindAndData(kind, p, len);
13218 }
13219 if (v == NULL)
13220 goto onError;
13221 r = _PyAccu_Accumulate(&acc, v);
13222 Py_DECREF(v);
13223 if (r)
13224 goto onError;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013225 if (width > len && repeat_accumulate(&acc, blank, width - len))
13226 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013227 if (dict && (argidx < arglen) && c != '%') {
13228 PyErr_SetString(PyExc_TypeError,
13229 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013230 goto onError;
13231 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013232 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013233 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013234 } /* until end */
13235 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013236 PyErr_SetString(PyExc_TypeError,
13237 "not all arguments converted during string formatting");
13238 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013239 }
13240
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013241 result = _PyAccu_Finish(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013242 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013243 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013244 }
13245 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013246 Py_XDECREF(temp);
13247 Py_XDECREF(second);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013248 return (PyObject *)result;
13249
Benjamin Peterson29060642009-01-31 22:14:21 +000013250 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013251 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013252 Py_XDECREF(temp);
13253 Py_XDECREF(second);
13254 _PyAccu_Destroy(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013255 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013256 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013257 }
13258 return NULL;
13259}
13260
Jeremy Hylton938ace62002-07-17 16:30:39 +000013261static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013262unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13263
Tim Peters6d6c1a32001-08-02 04:15:00 +000013264static PyObject *
13265unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13266{
Benjamin Peterson29060642009-01-31 22:14:21 +000013267 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013268 static char *kwlist[] = {"object", "encoding", "errors", 0};
13269 char *encoding = NULL;
13270 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013271
Benjamin Peterson14339b62009-01-31 16:36:08 +000013272 if (type != &PyUnicode_Type)
13273 return unicode_subtype_new(type, args, kwds);
13274 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013275 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013276 return NULL;
13277 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013278 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013279 if (encoding == NULL && errors == NULL)
13280 return PyObject_Str(x);
13281 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013282 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013283}
13284
Guido van Rossume023fe02001-08-30 03:12:59 +000013285static PyObject *
13286unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13287{
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013288 PyUnicodeObject *unicode, *self;
13289 Py_ssize_t length, char_size;
13290 int share_wstr, share_utf8;
13291 unsigned int kind;
13292 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013293
Benjamin Peterson14339b62009-01-31 16:36:08 +000013294 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013295
13296 unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
13297 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013298 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013299 assert(_PyUnicode_CHECK(unicode));
Victor Stinnere06e1452011-10-04 20:52:31 +020013300 if (PyUnicode_READY(unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013301 return NULL;
13302
13303 self = (PyUnicodeObject *) type->tp_alloc(type, 0);
13304 if (self == NULL) {
13305 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013306 return NULL;
13307 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013308 kind = PyUnicode_KIND(unicode);
13309 length = PyUnicode_GET_LENGTH(unicode);
13310
13311 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013312#ifdef Py_DEBUG
13313 _PyUnicode_HASH(self) = -1;
13314#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013315 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013316#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013317 _PyUnicode_STATE(self).interned = 0;
13318 _PyUnicode_STATE(self).kind = kind;
13319 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013320 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013321 _PyUnicode_STATE(self).ready = 1;
13322 _PyUnicode_WSTR(self) = NULL;
13323 _PyUnicode_UTF8_LENGTH(self) = 0;
13324 _PyUnicode_UTF8(self) = NULL;
13325 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013326 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013327
13328 share_utf8 = 0;
13329 share_wstr = 0;
13330 if (kind == PyUnicode_1BYTE_KIND) {
13331 char_size = 1;
13332 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13333 share_utf8 = 1;
13334 }
13335 else if (kind == PyUnicode_2BYTE_KIND) {
13336 char_size = 2;
13337 if (sizeof(wchar_t) == 2)
13338 share_wstr = 1;
13339 }
13340 else {
13341 assert(kind == PyUnicode_4BYTE_KIND);
13342 char_size = 4;
13343 if (sizeof(wchar_t) == 4)
13344 share_wstr = 1;
13345 }
13346
13347 /* Ensure we won't overflow the length. */
13348 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13349 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013350 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013351 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013352 data = PyObject_MALLOC((length + 1) * char_size);
13353 if (data == NULL) {
13354 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013355 goto onError;
13356 }
13357
Victor Stinnerc3c74152011-10-02 20:39:55 +020013358 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013359 if (share_utf8) {
13360 _PyUnicode_UTF8_LENGTH(self) = length;
13361 _PyUnicode_UTF8(self) = data;
13362 }
13363 if (share_wstr) {
13364 _PyUnicode_WSTR_LENGTH(self) = length;
13365 _PyUnicode_WSTR(self) = (wchar_t *)data;
13366 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013367
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013368 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013369 kind * (length + 1));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013370 Py_DECREF(unicode);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013371 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013372#ifdef Py_DEBUG
13373 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13374#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013375 return (PyObject *)self;
13376
13377onError:
13378 Py_DECREF(unicode);
13379 Py_DECREF(self);
13380 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013381}
13382
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013383PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000013384 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000013385\n\
Collin Winterd474ce82007-08-07 19:42:11 +000013386Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000013387encoding defaults to the current default string encoding.\n\
13388errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013389
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013390static PyObject *unicode_iter(PyObject *seq);
13391
Guido van Rossumd57fd912000-03-10 22:53:23 +000013392PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000013393 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013394 "str", /* tp_name */
13395 sizeof(PyUnicodeObject), /* tp_size */
13396 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013397 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013398 (destructor)unicode_dealloc, /* tp_dealloc */
13399 0, /* tp_print */
13400 0, /* tp_getattr */
13401 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013402 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013403 unicode_repr, /* tp_repr */
13404 &unicode_as_number, /* tp_as_number */
13405 &unicode_as_sequence, /* tp_as_sequence */
13406 &unicode_as_mapping, /* tp_as_mapping */
13407 (hashfunc) unicode_hash, /* tp_hash*/
13408 0, /* tp_call*/
13409 (reprfunc) unicode_str, /* tp_str */
13410 PyObject_GenericGetAttr, /* tp_getattro */
13411 0, /* tp_setattro */
13412 0, /* tp_as_buffer */
13413 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000013414 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013415 unicode_doc, /* tp_doc */
13416 0, /* tp_traverse */
13417 0, /* tp_clear */
13418 PyUnicode_RichCompare, /* tp_richcompare */
13419 0, /* tp_weaklistoffset */
13420 unicode_iter, /* tp_iter */
13421 0, /* tp_iternext */
13422 unicode_methods, /* tp_methods */
13423 0, /* tp_members */
13424 0, /* tp_getset */
13425 &PyBaseObject_Type, /* tp_base */
13426 0, /* tp_dict */
13427 0, /* tp_descr_get */
13428 0, /* tp_descr_set */
13429 0, /* tp_dictoffset */
13430 0, /* tp_init */
13431 0, /* tp_alloc */
13432 unicode_new, /* tp_new */
13433 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013434};
13435
13436/* Initialize the Unicode implementation */
13437
Thomas Wouters78890102000-07-22 19:25:51 +000013438void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013439{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013440 int i;
13441
Thomas Wouters477c8d52006-05-27 19:21:47 +000013442 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013443 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013444 0x000A, /* LINE FEED */
13445 0x000D, /* CARRIAGE RETURN */
13446 0x001C, /* FILE SEPARATOR */
13447 0x001D, /* GROUP SEPARATOR */
13448 0x001E, /* RECORD SEPARATOR */
13449 0x0085, /* NEXT LINE */
13450 0x2028, /* LINE SEPARATOR */
13451 0x2029, /* PARAGRAPH SEPARATOR */
13452 };
13453
Fred Drakee4315f52000-05-09 19:53:39 +000013454 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013455 unicode_empty = PyUnicode_New(0, 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013456 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013457 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013458 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013459
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013460 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013461 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013462 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013463 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013464
13465 /* initialize the linebreak bloom filter */
13466 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013467 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013468 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013469
13470 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013471}
13472
13473/* Finalize the Unicode implementation */
13474
/* Nothing is released by this implementation; the function always
   returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
13480
Guido van Rossumd57fd912000-03-10 22:53:23 +000013481void
Thomas Wouters78890102000-07-22 19:25:51 +000013482_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013483{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013484 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013485
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013486 Py_XDECREF(unicode_empty);
13487 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013488
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013489 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013490 if (unicode_latin1[i]) {
13491 Py_DECREF(unicode_latin1[i]);
13492 unicode_latin1[i] = NULL;
13493 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013494 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020013495 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000013496 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013497}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013498
Walter Dörwald16807132007-05-25 13:52:07 +000013499void
13500PyUnicode_InternInPlace(PyObject **p)
13501{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013502 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
13503 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020013504#ifdef Py_DEBUG
13505 assert(s != NULL);
13506 assert(_PyUnicode_CHECK(s));
13507#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000013508 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020013509 return;
13510#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000013511 /* If it's a subclass, we don't really know what putting
13512 it in the interned dict might do. */
13513 if (!PyUnicode_CheckExact(s))
13514 return;
13515 if (PyUnicode_CHECK_INTERNED(s))
13516 return;
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013517 if (_PyUnicode_READY_REPLACE(p)) {
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013518 assert(0 && "_PyUnicode_READY_REPLACE fail in PyUnicode_InternInPlace");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013519 return;
13520 }
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013521 s = (PyUnicodeObject *)(*p);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013522 if (interned == NULL) {
13523 interned = PyDict_New();
13524 if (interned == NULL) {
13525 PyErr_Clear(); /* Don't leave an exception */
13526 return;
13527 }
13528 }
13529 /* It might be that the GetItem call fails even
13530 though the key is present in the dictionary,
13531 namely when this happens during a stack overflow. */
13532 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000013533 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013534 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000013535
Benjamin Peterson29060642009-01-31 22:14:21 +000013536 if (t) {
13537 Py_INCREF(t);
13538 Py_DECREF(*p);
13539 *p = t;
13540 return;
13541 }
Walter Dörwald16807132007-05-25 13:52:07 +000013542
Benjamin Peterson14339b62009-01-31 16:36:08 +000013543 PyThreadState_GET()->recursion_critical = 1;
13544 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
13545 PyErr_Clear();
13546 PyThreadState_GET()->recursion_critical = 0;
13547 return;
13548 }
13549 PyThreadState_GET()->recursion_critical = 0;
13550 /* The two references in interned are not counted by refcnt.
13551 The deallocator will take care of this */
13552 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013553 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000013554}
13555
13556void
13557PyUnicode_InternImmortal(PyObject **p)
13558{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013559 PyUnicodeObject *u = (PyUnicodeObject *)*p;
13560
Benjamin Peterson14339b62009-01-31 16:36:08 +000013561 PyUnicode_InternInPlace(p);
13562 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013563 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013564 Py_INCREF(*p);
13565 }
Walter Dörwald16807132007-05-25 13:52:07 +000013566}
13567
13568PyObject *
13569PyUnicode_InternFromString(const char *cp)
13570{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013571 PyObject *s = PyUnicode_FromString(cp);
13572 if (s == NULL)
13573 return NULL;
13574 PyUnicode_InternInPlace(&s);
13575 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000013576}
13577
Alexander Belopolsky40018472011-02-26 01:02:56 +000013578void
13579_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000013580{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013581 PyObject *keys;
13582 PyUnicodeObject *s;
13583 Py_ssize_t i, n;
13584 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000013585
Benjamin Peterson14339b62009-01-31 16:36:08 +000013586 if (interned == NULL || !PyDict_Check(interned))
13587 return;
13588 keys = PyDict_Keys(interned);
13589 if (keys == NULL || !PyList_Check(keys)) {
13590 PyErr_Clear();
13591 return;
13592 }
Walter Dörwald16807132007-05-25 13:52:07 +000013593
Benjamin Peterson14339b62009-01-31 16:36:08 +000013594 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13595 detector, interned unicode strings are not forcibly deallocated;
13596 rather, we give them their stolen references back, and then clear
13597 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000013598
Benjamin Peterson14339b62009-01-31 16:36:08 +000013599 n = PyList_GET_SIZE(keys);
13600 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000013601 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013602 for (i = 0; i < n; i++) {
13603 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013604 if (PyUnicode_READY(s) == -1) {
13605 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013606 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013607 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013608 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013609 case SSTATE_NOT_INTERNED:
13610 /* XXX Shouldn't happen */
13611 break;
13612 case SSTATE_INTERNED_IMMORTAL:
13613 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013614 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013615 break;
13616 case SSTATE_INTERNED_MORTAL:
13617 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013618 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013619 break;
13620 default:
13621 Py_FatalError("Inconsistent interned string state.");
13622 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013623 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013624 }
13625 fprintf(stderr, "total size of all interned strings: "
13626 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13627 "mortal/immortal\n", mortal_size, immortal_size);
13628 Py_DECREF(keys);
13629 PyDict_Clear(interned);
13630 Py_DECREF(interned);
13631 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000013632}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013633
13634
13635/********************* Unicode Iterator **************************/
13636
/* Iterator object over the characters of a str. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;     /* Index of the next character to return */
    PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
13642
13643static void
13644unicodeiter_dealloc(unicodeiterobject *it)
13645{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013646 _PyObject_GC_UNTRACK(it);
13647 Py_XDECREF(it->it_seq);
13648 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013649}
13650
13651static int
13652unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
13653{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013654 Py_VISIT(it->it_seq);
13655 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013656}
13657
13658static PyObject *
13659unicodeiter_next(unicodeiterobject *it)
13660{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013661 PyUnicodeObject *seq;
13662 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013663
Benjamin Peterson14339b62009-01-31 16:36:08 +000013664 assert(it != NULL);
13665 seq = it->it_seq;
13666 if (seq == NULL)
13667 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013668 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013669
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013670 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
13671 int kind = PyUnicode_KIND(seq);
13672 void *data = PyUnicode_DATA(seq);
13673 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
13674 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013675 if (item != NULL)
13676 ++it->it_index;
13677 return item;
13678 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013679
Benjamin Peterson14339b62009-01-31 16:36:08 +000013680 Py_DECREF(seq);
13681 it->it_seq = NULL;
13682 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013683}
13684
13685static PyObject *
13686unicodeiter_len(unicodeiterobject *it)
13687{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013688 Py_ssize_t len = 0;
13689 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013690 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013691 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013692}
13693
13694PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
13695
13696static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013697 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000013698 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000013699 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013700};
13701
13702PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013703 PyVarObject_HEAD_INIT(&PyType_Type, 0)
13704 "str_iterator", /* tp_name */
13705 sizeof(unicodeiterobject), /* tp_basicsize */
13706 0, /* tp_itemsize */
13707 /* methods */
13708 (destructor)unicodeiter_dealloc, /* tp_dealloc */
13709 0, /* tp_print */
13710 0, /* tp_getattr */
13711 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013712 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013713 0, /* tp_repr */
13714 0, /* tp_as_number */
13715 0, /* tp_as_sequence */
13716 0, /* tp_as_mapping */
13717 0, /* tp_hash */
13718 0, /* tp_call */
13719 0, /* tp_str */
13720 PyObject_GenericGetAttr, /* tp_getattro */
13721 0, /* tp_setattro */
13722 0, /* tp_as_buffer */
13723 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
13724 0, /* tp_doc */
13725 (traverseproc)unicodeiter_traverse, /* tp_traverse */
13726 0, /* tp_clear */
13727 0, /* tp_richcompare */
13728 0, /* tp_weaklistoffset */
13729 PyObject_SelfIter, /* tp_iter */
13730 (iternextfunc)unicodeiter_next, /* tp_iternext */
13731 unicodeiter_methods, /* tp_methods */
13732 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013733};
13734
13735static PyObject *
13736unicode_iter(PyObject *seq)
13737{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013738 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013739
Benjamin Peterson14339b62009-01-31 16:36:08 +000013740 if (!PyUnicode_Check(seq)) {
13741 PyErr_BadInternalCall();
13742 return NULL;
13743 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013744 if (PyUnicode_READY(seq) == -1)
13745 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013746 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
13747 if (it == NULL)
13748 return NULL;
13749 it->it_index = 0;
13750 Py_INCREF(seq);
13751 it->it_seq = (PyUnicodeObject *)seq;
13752 _PyObject_GC_TRACK(it);
13753 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013754}
13755
/* Include "uniops.h" twice with different macro bindings: first with
   UNIOP(x) expanding to Py_UNICODE_x and UNIOP_t to Py_UNICODE, then with
   UNIOP(x) expanding to Py_UCS4_x and UNIOP_t to Py_UCS4.  The macros are
   undefined again after each inclusion.
   NOTE(review): what each inclusion defines depends on uniops.h, which is
   not visible here -- presumably one set of helpers per character type. */
#define UNIOP(x) Py_UNICODE_##x
#define UNIOP_t Py_UNICODE
#include "uniops.h"
#undef UNIOP
#undef UNIOP_t
#define UNIOP(x) Py_UCS4_##x
#define UNIOP_t Py_UCS4
#include "uniops.h"
#undef UNIOP
#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000013766
Victor Stinner71133ff2010-09-01 23:43:53 +000013767Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000013768PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000013769{
13770 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
Victor Stinner577db2c2011-10-11 22:12:48 +020013771 Py_UNICODE *u, *copy;
Victor Stinner71133ff2010-09-01 23:43:53 +000013772 Py_ssize_t size;
13773
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013774 if (!PyUnicode_Check(unicode)) {
13775 PyErr_BadArgument();
13776 return NULL;
13777 }
Victor Stinner577db2c2011-10-11 22:12:48 +020013778 u = PyUnicode_AsUnicode(object);
13779 if (u == NULL)
13780 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000013781 /* Ensure we won't overflow the size. */
13782 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
13783 PyErr_NoMemory();
13784 return NULL;
13785 }
13786 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
13787 size *= sizeof(Py_UNICODE);
13788 copy = PyMem_Malloc(size);
13789 if (copy == NULL) {
13790 PyErr_NoMemory();
13791 return NULL;
13792 }
Victor Stinner577db2c2011-10-11 22:12:48 +020013793 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000013794 return copy;
13795}
Martin v. Löwis5b222132007-06-10 09:51:05 +000013796
Georg Brandl66c221e2010-10-14 07:04:07 +000013797/* A _string module, to export formatter_parser and formatter_field_name_split
13798 to the string.Formatter class implemented in Python. */
13799
13800static PyMethodDef _string_methods[] = {
13801 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
13802 METH_O, PyDoc_STR("split the argument as a field name")},
13803 {"formatter_parser", (PyCFunction) formatter_parser,
13804 METH_O, PyDoc_STR("parse the argument as a format string")},
13805 {NULL, NULL}
13806};
13807
13808static struct PyModuleDef _string_module = {
13809 PyModuleDef_HEAD_INIT,
13810 "_string",
13811 PyDoc_STR("string helper module"),
13812 0,
13813 _string_methods,
13814 NULL,
13815 NULL,
13816 NULL,
13817 NULL
13818};
13819
13820PyMODINIT_FUNC
13821PyInit__string(void)
13822{
13823 return PyModule_Create(&_string_module);
13824}
13825
13826
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013827#ifdef __cplusplus
13828}
13829#endif