blob: 5d5bb9a8a071d60aa7b7d80b3a6094138ef9a45f [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
/* Debug builds define DONT_MAKE_RESULT_READY.
   NOTE(review): the code that tests this macro is not visible in this
   part of the file -- confirm its effect before changing it. */
#ifdef Py_DEBUG
# define DONT_MAKE_RESULT_READY
#endif

/* Limit for the Unicode object free list */

#define PyUnicode_MAXFREELIST 1024

/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size less than this
   limit. This reduces malloc() overhead for small Unicode objects.

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Endianness switches; defaults to little endian */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
/* Type check used inside the assertions of the macros below.  Debug
   builds run the structural consistency checker (without the content
   scan); other builds only test the type. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

/* Raw field accessors.  The names starting with an underscore expand to
   the struct member itself (usable as an lvalue) and perform no checks,
   except _PyUnicode_KIND and _PyUnicode_GET_LENGTH which assert the
   object type first.  The PyUnicode_UTF8* variants assert that the
   object is a ready unicode object and, for compact ASCII objects, use
   the data stored right after the PyASCIIObject header instead of the
   utf8/utf8_length members. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200136
/* Local replacement for PyUnicode_READY: evaluates to 0 when the object
   is already ready, otherwise to the result of _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready((PyObject *)(op))))

/* Same test for an object reached through a pointer-to-pointer; when
   the object is not ready the pointer is handed to
   _PyUnicode_ReadyReplace().  The argument is parenthesised so that an
   expression such as `&obj` or `objs + i` dereferences correctly. */
#define _PyUnicode_READY_REPLACE(p_obj)                 \
    (assert(_PyUnicode_CHECK(*(p_obj))),                \
     (PyUnicode_IS_READY(*(p_obj)) ?                    \
      0 : _PyUnicode_ReadyReplace((PyObject **)(p_obj))))
148
/* true if the utf8 pointer of a (non compact-ASCII) object is the same
   block as the object's character data */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* true if the wstr pointer is the same block as the object's character
   data.  The comparison uses the macro argument on both sides, so it
   does not depend on the caller's variable name. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
156
/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data) */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data).  An object that is not ready yet and
   has a wstr pointer always counts as owning it. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
172
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Each source element is converted with a plain cast.  The main loop
   handles four elements per iteration; the trailing loop copies the
   remaining 0-3 elements.  All macro arguments are parenthesised and
   all locals carry a leading underscore. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *) (to);                \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + (_n & ~ (Py_ssize_t) 3);            \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200196
/* The Unicode string has been modified: reset the hash (stored as -1) */
#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
199
Walter Dörwald16807132007-05-25 13:52:07 +0000200/* This dictionary holds all interned unicode strings. Note that references
201 to strings in this dictionary are *not* counted in the string's ob_refcnt.
202 When the interned string reaches a refcnt of 0 the string deallocation
203 function will delete the reference from this dictionary.
204
205 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000206 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000207*/
208static PyObject *interned;
209
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000210/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200211static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000212
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200213/* List of static strings. */
214static _Py_Identifier *static_strings;
215
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000216/* Single character Unicode strings in the Latin-1 range are being
217 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200218static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000219
/* Fast detection of the most frequent whitespace characters.
   128-entry table indexed by ASCII code: 1 for whitespace, 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x0009: * CHARACTER TABULATION */
/*     case 0x000A: * LINE FEED */
/*     case 0x000B: * LINE TABULATION */
/*     case 0x000C: * FORM FEED */
/*     case 0x000D: * CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x001C: * FILE SEPARATOR */
/*     case 0x001D: * GROUP SEPARATOR */
/*     case 0x001E: * RECORD SEPARATOR */
/*     case 0x001F: * UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
/*     case 0x0020: * SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
250
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200251/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200252static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200253static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200254static void copy_characters(
255 PyObject *to, Py_ssize_t to_start,
256 PyObject *from, Py_ssize_t from_start,
257 Py_ssize_t how_many);
Victor Stinnerc729b8e2011-10-06 02:36:59 +0200258#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200259static int unicode_is_singleton(PyObject *unicode);
Victor Stinnerc729b8e2011-10-06 02:36:59 +0200260#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200261
Alexander Belopolsky40018472011-02-26 01:02:56 +0000262static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200263unicode_fromascii(const unsigned char *s, Py_ssize_t size);
264static PyObject *
265_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
266static PyObject *
267_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
268static PyObject *
269_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
270
271static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000272unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000273 PyObject **errorHandler,const char *encoding, const char *reason,
274 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
275 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
276
Alexander Belopolsky40018472011-02-26 01:02:56 +0000277static void
278raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300279 const char *encoding,
280 const Py_UNICODE *unicode, Py_ssize_t size,
281 Py_ssize_t startpos, Py_ssize_t endpos,
282 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000283
/* Same for linebreaks: 128-entry table indexed by ASCII code, 1 for a
   line break character and 0 otherwise.  The table is only read, so it
   is declared const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
311
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300312/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
313 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000314Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000315PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000316{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000317#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000318 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000319#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000320 /* This is actually an illegal character, so it should
321 not be passed to unichr. */
322 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000323#endif
324}
325
#ifdef Py_DEBUG
/* Debug-only sanity checker for the internal layout of a unicode object.

   op            -- the object to check; must pass PyUnicode_Check().
   check_content -- when non-zero, additionally scan the characters to
                    verify that the narrowest suitable kind is used and,
                    for objects that are not singletons, that the cached
                    hash is still unset (-1).

   Every violated invariant trips an assert().  The function always
   returns 1, so a call can itself be wrapped in assert().

   FIXME: use PyObject* type for op */
int
_PyUnicode_CheckConsistency(void *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII: nothing but the PyASCIIObject header to check */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: characters follow the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert(compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready yet: only the wstr buffer exists */
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
            }
            else {
                /* ready, non-compact: separately stored character data */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert(compact->utf8 == data);
                    assert(compact->utf8_length == ascii->length);
                }
                else {
                    assert(compact->utf8 != data);
                }
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr must alias the data exactly when the kind has the
               same width as wchar_t */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        /* a missing cache must have a zero length */
        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data = PyUnicode_DATA(ascii);
        for (i=0; i < ascii->length; i++)
        {
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
            }
            else {
                assert(maxchar < 128);
            }
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
        }
        else {
            assert(maxchar >= 0x10000);
        }
    }
    if (check_content && !unicode_is_singleton((PyObject*)ascii)) {
        assert(ascii->hash == -1);
    }
    return 1;
}
#endif
431
Thomas Wouters477c8d52006-05-27 19:21:47 +0000432/* --- Bloom Filters ----------------------------------------------------- */
433
434/* stuff to implement simple "bloom filters" for Unicode characters.
435 to keep things simple, we use a single bitmask, using the least 5
436 bits from each unicode characters as the bit index. */
437
438/* the linebreak mask is set up by Unicode_Init below */
439
Antoine Pitrouf068f942010-01-13 14:19:12 +0000440#if LONG_BIT >= 128
441#define BLOOM_WIDTH 128
442#elif LONG_BIT >= 64
443#define BLOOM_WIDTH 64
444#elif LONG_BIT >= 32
445#define BLOOM_WIDTH 32
446#else
447#error "LONG_BIT is smaller than 32"
448#endif
449
Thomas Wouters477c8d52006-05-27 19:21:47 +0000450#define BLOOM_MASK unsigned long
451
452static BLOOM_MASK bloom_linebreak;
453
Antoine Pitrouf068f942010-01-13 14:19:12 +0000454#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
455#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000456
Benjamin Peterson29060642009-01-31 22:14:21 +0000457#define BLOOM_LINEBREAK(ch) \
458 ((ch) < 128U ? ascii_linebreak[(ch)] : \
459 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000460
Alexander Belopolsky40018472011-02-26 01:02:56 +0000461Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200462make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000463{
464 /* calculate simple bloom-style bitmask for a given unicode string */
465
Antoine Pitrouf068f942010-01-13 14:19:12 +0000466 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000467 Py_ssize_t i;
468
469 mask = 0;
470 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200471 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000472
473 return mask;
474}
475
/* true if chr occurs in str: the bloom mask is tested first and the
   string is searched with PyUnicode_FindChar() only on a mask hit */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr)                                                   \
     && (PyUnicode_FindChar((str), (chr), 0,                            \
                            PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000479
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200480/* Compilation of templated routines */
481
482#include "stringlib/asciilib.h"
483#include "stringlib/fastsearch.h"
484#include "stringlib/partition.h"
485#include "stringlib/split.h"
486#include "stringlib/count.h"
487#include "stringlib/find.h"
488#include "stringlib/find_max_char.h"
489#include "stringlib/localeutil.h"
490#include "stringlib/undef.h"
491
492#include "stringlib/ucs1lib.h"
493#include "stringlib/fastsearch.h"
494#include "stringlib/partition.h"
495#include "stringlib/split.h"
496#include "stringlib/count.h"
497#include "stringlib/find.h"
498#include "stringlib/find_max_char.h"
499#include "stringlib/localeutil.h"
500#include "stringlib/undef.h"
501
502#include "stringlib/ucs2lib.h"
503#include "stringlib/fastsearch.h"
504#include "stringlib/partition.h"
505#include "stringlib/split.h"
506#include "stringlib/count.h"
507#include "stringlib/find.h"
508#include "stringlib/find_max_char.h"
509#include "stringlib/localeutil.h"
510#include "stringlib/undef.h"
511
512#include "stringlib/ucs4lib.h"
513#include "stringlib/fastsearch.h"
514#include "stringlib/partition.h"
515#include "stringlib/split.h"
516#include "stringlib/count.h"
517#include "stringlib/find.h"
518#include "stringlib/find_max_char.h"
519#include "stringlib/localeutil.h"
520#include "stringlib/undef.h"
521
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200522#include "stringlib/unicodedefs.h"
523#include "stringlib/fastsearch.h"
524#include "stringlib/count.h"
525#include "stringlib/find.h"
526
Guido van Rossumd57fd912000-03-10 22:53:23 +0000527/* --- Unicode Object ----------------------------------------------------- */
528
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200529static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200530fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200531
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200532Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
533 Py_ssize_t size, Py_UCS4 ch,
534 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200535{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200536 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
537
538 switch (kind) {
539 case PyUnicode_1BYTE_KIND:
540 {
541 Py_UCS1 ch1 = (Py_UCS1) ch;
542 if (ch1 == ch)
543 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
544 else
545 return -1;
546 }
547 case PyUnicode_2BYTE_KIND:
548 {
549 Py_UCS2 ch2 = (Py_UCS2) ch;
550 if (ch2 == ch)
551 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
552 else
553 return -1;
554 }
555 case PyUnicode_4BYTE_KIND:
556 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
557 default:
558 assert(0);
559 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200560 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200561}
562
Victor Stinnerfe226c02011-10-03 03:52:20 +0200563static PyObject*
564resize_compact(PyObject *unicode, Py_ssize_t length)
565{
566 Py_ssize_t char_size;
567 Py_ssize_t struct_size;
568 Py_ssize_t new_size;
569 int share_wstr;
570
571 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200572 char_size = PyUnicode_KIND(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200573 if (PyUnicode_IS_COMPACT_ASCII(unicode))
574 struct_size = sizeof(PyASCIIObject);
575 else
576 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200577 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200578
579 _Py_DEC_REFTOTAL;
580 _Py_ForgetReference(unicode);
581
582 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
583 PyErr_NoMemory();
584 return NULL;
585 }
586 new_size = (struct_size + (length + 1) * char_size);
587
588 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
589 if (unicode == NULL) {
590 PyObject_Del(unicode);
591 PyErr_NoMemory();
592 return NULL;
593 }
594 _Py_NewReference(unicode);
595 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200596 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200597 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200598 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
599 _PyUnicode_WSTR_LENGTH(unicode) = length;
600 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200601 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
602 length, 0);
603 return unicode;
604}
605
Alexander Belopolsky40018472011-02-26 01:02:56 +0000606static int
Victor Stinner95663112011-10-04 01:03:50 +0200607resize_inplace(PyUnicodeObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000608{
Victor Stinner95663112011-10-04 01:03:50 +0200609 wchar_t *wstr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200610 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200611 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000612
Victor Stinner95663112011-10-04 01:03:50 +0200613 _PyUnicode_DIRTY(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200614
615 if (PyUnicode_IS_READY(unicode)) {
616 Py_ssize_t char_size;
617 Py_ssize_t new_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200618 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200619 void *data;
620
621 data = _PyUnicode_DATA_ANY(unicode);
622 assert(data != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200623 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200624 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
625 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinner95663112011-10-04 01:03:50 +0200626 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
627 {
628 PyObject_DEL(_PyUnicode_UTF8(unicode));
629 _PyUnicode_UTF8(unicode) = NULL;
630 _PyUnicode_UTF8_LENGTH(unicode) = 0;
631 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200632
633 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
634 PyErr_NoMemory();
635 return -1;
636 }
637 new_size = (length + 1) * char_size;
638
639 data = (PyObject *)PyObject_REALLOC(data, new_size);
640 if (data == NULL) {
641 PyErr_NoMemory();
642 return -1;
643 }
644 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200645 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200646 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200647 _PyUnicode_WSTR_LENGTH(unicode) = length;
648 }
649 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200650 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200651 _PyUnicode_UTF8_LENGTH(unicode) = length;
652 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200653 _PyUnicode_LENGTH(unicode) = length;
654 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200655 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200656 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200657 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200658 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200659 }
Victor Stinner95663112011-10-04 01:03:50 +0200660 assert(_PyUnicode_WSTR(unicode) != NULL);
661
662 /* check for integer overflow */
663 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
664 PyErr_NoMemory();
665 return -1;
666 }
667 wstr = _PyUnicode_WSTR(unicode);
668 wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
669 if (!wstr) {
670 PyErr_NoMemory();
671 return -1;
672 }
673 _PyUnicode_WSTR(unicode) = wstr;
674 _PyUnicode_WSTR(unicode)[length] = 0;
675 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200676 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000677 return 0;
678}
679
Victor Stinnerfe226c02011-10-03 03:52:20 +0200680static PyObject*
681resize_copy(PyObject *unicode, Py_ssize_t length)
682{
683 Py_ssize_t copy_length;
684 if (PyUnicode_IS_COMPACT(unicode)) {
685 PyObject *copy;
686 assert(PyUnicode_IS_READY(unicode));
687
688 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
689 if (copy == NULL)
690 return NULL;
691
692 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200693 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200694 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200695 }
696 else {
Victor Stinner2fd82272011-10-03 04:06:05 +0200697 PyUnicodeObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200698 assert(_PyUnicode_WSTR(unicode) != NULL);
699 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner2fd82272011-10-03 04:06:05 +0200700 w = _PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200701 if (w == NULL)
702 return NULL;
703 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
704 copy_length = Py_MIN(copy_length, length);
705 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
706 copy_length);
707 return (PyObject*)w;
708 }
709}
710
Guido van Rossumd57fd912000-03-10 22:53:23 +0000711/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000712 Ux0000 terminated; some code (e.g. new_identifier)
713 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000714
715 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000716 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000717
718*/
719
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200720#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200721static int unicode_old_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200722#endif
723
Alexander Belopolsky40018472011-02-26 01:02:56 +0000724static PyUnicodeObject *
725_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000726{
727 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200728 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000729
Thomas Wouters477c8d52006-05-27 19:21:47 +0000730 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000731 if (length == 0 && unicode_empty != NULL) {
732 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200733 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000734 }
735
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000736 /* Ensure we won't overflow the size. */
737 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
738 return (PyUnicodeObject *)PyErr_NoMemory();
739 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200740 if (length < 0) {
741 PyErr_SetString(PyExc_SystemError,
742 "Negative size passed to _PyUnicode_New");
743 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000744 }
745
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200746#ifdef Py_DEBUG
747 ++unicode_old_new_calls;
748#endif
749
750 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
751 if (unicode == NULL)
752 return NULL;
753 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
754 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
755 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000756 PyErr_NoMemory();
757 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000758 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200759
Jeremy Hyltond8082792003-09-16 19:41:39 +0000760 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000761 * the caller fails before initializing str -- unicode_resize()
762 * reads str[0], and the Keep-Alive optimization can keep memory
763 * allocated for str alive across a call to unicode_dealloc(unicode).
764 * We don't want unicode_resize to read uninitialized memory in
765 * that case.
766 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200767 _PyUnicode_WSTR(unicode)[0] = 0;
768 _PyUnicode_WSTR(unicode)[length] = 0;
769 _PyUnicode_WSTR_LENGTH(unicode) = length;
770 _PyUnicode_HASH(unicode) = -1;
771 _PyUnicode_STATE(unicode).interned = 0;
772 _PyUnicode_STATE(unicode).kind = 0;
773 _PyUnicode_STATE(unicode).compact = 0;
774 _PyUnicode_STATE(unicode).ready = 0;
775 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200776 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200777 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200778 _PyUnicode_UTF8(unicode) = NULL;
779 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000780 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000781
Benjamin Peterson29060642009-01-31 22:14:21 +0000782 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000783 /* XXX UNREF/NEWREF interface should be more symmetrical */
784 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000785 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000786 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000787 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000788}
789
Victor Stinnerf42dc442011-10-02 23:33:16 +0200790static const char*
791unicode_kind_name(PyObject *unicode)
792{
Victor Stinner42dfd712011-10-03 14:41:45 +0200793 /* don't check consistency: unicode_kind_name() is called from
794 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200795 if (!PyUnicode_IS_COMPACT(unicode))
796 {
797 if (!PyUnicode_IS_READY(unicode))
798 return "wstr";
799 switch(PyUnicode_KIND(unicode))
800 {
801 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200802 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200803 return "legacy ascii";
804 else
805 return "legacy latin1";
806 case PyUnicode_2BYTE_KIND:
807 return "legacy UCS2";
808 case PyUnicode_4BYTE_KIND:
809 return "legacy UCS4";
810 default:
811 return "<legacy invalid kind>";
812 }
813 }
814 assert(PyUnicode_IS_READY(unicode));
815 switch(PyUnicode_KIND(unicode))
816 {
817 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200818 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200819 return "ascii";
820 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200821 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200822 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200823 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200824 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200825 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200826 default:
827 return "<invalid compact kind>";
828 }
829}
830
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200831#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200832static int unicode_new_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200833
834/* Functions wrapping macros for use in debugger */
/* Debugger helper: function form of the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
838
/* Debugger helper: function form of the _PyUnicode_COMPACT_DATA() macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
842void *_PyUnicode_data(void *unicode){
843 printf("obj %p\n", unicode);
844 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
845 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
846 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
847 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
848 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
849 return PyUnicode_DATA(unicode);
850}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200851
852void
853_PyUnicode_Dump(PyObject *op)
854{
855 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200856 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
857 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
858 void *data;
859 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
860 if (ascii->state.compact)
861 data = (compact + 1);
862 else
863 data = unicode->data.any;
864 if (ascii->wstr == data)
865 printf("shared ");
866 printf("wstr=%p", ascii->wstr);
Victor Stinnera3b334d2011-10-03 13:53:37 +0200867 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200868 printf(" (%zu), ", compact->wstr_length);
869 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
870 printf("shared ");
871 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200872 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200873 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200874}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200875#endif
876
877PyObject *
878PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
879{
880 PyObject *obj;
881 PyCompactUnicodeObject *unicode;
882 void *data;
883 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200884 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200885 Py_ssize_t char_size;
886 Py_ssize_t struct_size;
887
888 /* Optimization for empty strings */
889 if (size == 0 && unicode_empty != NULL) {
890 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200891 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200892 }
893
894#ifdef Py_DEBUG
895 ++unicode_new_new_calls;
896#endif
897
Victor Stinner9e9d6892011-10-04 01:02:02 +0200898 is_ascii = 0;
899 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200900 struct_size = sizeof(PyCompactUnicodeObject);
901 if (maxchar < 128) {
902 kind_state = PyUnicode_1BYTE_KIND;
903 char_size = 1;
904 is_ascii = 1;
905 struct_size = sizeof(PyASCIIObject);
906 }
907 else if (maxchar < 256) {
908 kind_state = PyUnicode_1BYTE_KIND;
909 char_size = 1;
910 }
911 else if (maxchar < 65536) {
912 kind_state = PyUnicode_2BYTE_KIND;
913 char_size = 2;
914 if (sizeof(wchar_t) == 2)
915 is_sharing = 1;
916 }
917 else {
918 kind_state = PyUnicode_4BYTE_KIND;
919 char_size = 4;
920 if (sizeof(wchar_t) == 4)
921 is_sharing = 1;
922 }
923
924 /* Ensure we won't overflow the size. */
925 if (size < 0) {
926 PyErr_SetString(PyExc_SystemError,
927 "Negative size passed to PyUnicode_New");
928 return NULL;
929 }
930 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
931 return PyErr_NoMemory();
932
933 /* Duplicated allocation code from _PyObject_New() instead of a call to
934 * PyObject_New() so we are able to allocate space for the object and
935 * it's data buffer.
936 */
937 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
938 if (obj == NULL)
939 return PyErr_NoMemory();
940 obj = PyObject_INIT(obj, &PyUnicode_Type);
941 if (obj == NULL)
942 return NULL;
943
944 unicode = (PyCompactUnicodeObject *)obj;
945 if (is_ascii)
946 data = ((PyASCIIObject*)obj) + 1;
947 else
948 data = unicode + 1;
949 _PyUnicode_LENGTH(unicode) = size;
950 _PyUnicode_HASH(unicode) = -1;
951 _PyUnicode_STATE(unicode).interned = 0;
952 _PyUnicode_STATE(unicode).kind = kind_state;
953 _PyUnicode_STATE(unicode).compact = 1;
954 _PyUnicode_STATE(unicode).ready = 1;
955 _PyUnicode_STATE(unicode).ascii = is_ascii;
956 if (is_ascii) {
957 ((char*)data)[size] = 0;
958 _PyUnicode_WSTR(unicode) = NULL;
959 }
960 else if (kind_state == PyUnicode_1BYTE_KIND) {
961 ((char*)data)[size] = 0;
962 _PyUnicode_WSTR(unicode) = NULL;
963 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200964 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200965 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200966 }
967 else {
968 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200969 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200970 if (kind_state == PyUnicode_2BYTE_KIND)
971 ((Py_UCS2*)data)[size] = 0;
972 else /* kind_state == PyUnicode_4BYTE_KIND */
973 ((Py_UCS4*)data)[size] = 0;
974 if (is_sharing) {
975 _PyUnicode_WSTR_LENGTH(unicode) = size;
976 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
977 }
978 else {
979 _PyUnicode_WSTR_LENGTH(unicode) = 0;
980 _PyUnicode_WSTR(unicode) = NULL;
981 }
982 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200983 assert(_PyUnicode_CheckConsistency(unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200984 return obj;
985}
986
987#if SIZEOF_WCHAR_T == 2
988/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
989 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +0200990 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200991
992 This function assumes that unicode can hold one more code point than wstr
993 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +0200994static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200995unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
996 PyUnicodeObject *unicode)
997{
998 const wchar_t *iter;
999 Py_UCS4 *ucs4_out;
1000
Victor Stinner910337b2011-10-03 03:20:16 +02001001 assert(unicode != NULL);
1002 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001003 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1004 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1005
1006 for (iter = begin; iter < end; ) {
1007 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1008 _PyUnicode_GET_LENGTH(unicode)));
1009 if (*iter >= 0xD800 && *iter <= 0xDBFF
1010 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1011 {
1012 *ucs4_out++ = (((iter[0] & 0x3FF)<<10) | (iter[1] & 0x3FF)) + 0x10000;
1013 iter += 2;
1014 }
1015 else {
1016 *ucs4_out++ = *iter;
1017 iter++;
1018 }
1019 }
1020 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1021 _PyUnicode_GET_LENGTH(unicode)));
1022
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001023}
1024#endif
1025
Victor Stinnercd9950f2011-10-02 00:34:53 +02001026static int
1027_PyUnicode_Dirty(PyObject *unicode)
1028{
Victor Stinner910337b2011-10-03 03:20:16 +02001029 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +02001030 if (Py_REFCNT(unicode) != 1) {
Victor Stinner01698042011-10-04 00:04:26 +02001031 PyErr_SetString(PyExc_SystemError,
Victor Stinnercd9950f2011-10-02 00:34:53 +02001032 "Cannot modify a string having more than 1 reference");
1033 return -1;
1034 }
1035 _PyUnicode_DIRTY(unicode);
1036 return 0;
1037}
1038
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001039static int
1040_copy_characters(PyObject *to, Py_ssize_t to_start,
1041 PyObject *from, Py_ssize_t from_start,
1042 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001043{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001044 unsigned int from_kind, to_kind;
1045 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001046 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001047
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001048 assert(PyUnicode_Check(from));
1049 assert(PyUnicode_Check(to));
1050 assert(PyUnicode_IS_READY(from));
1051 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001052
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001053 assert(PyUnicode_GET_LENGTH(from) >= how_many);
1054 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1055 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001056
Victor Stinnerf5ca1a22011-09-28 23:54:59 +02001057 if (how_many == 0)
1058 return 0;
1059
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001060 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001061 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001062 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001063 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001064
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001065#ifdef Py_DEBUG
1066 if (!check_maxchar
1067 && (from_kind > to_kind
1068 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001069 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001070 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1071 Py_UCS4 ch;
1072 Py_ssize_t i;
1073 for (i=0; i < how_many; i++) {
1074 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1075 assert(ch <= to_maxchar);
1076 }
1077 }
1078#endif
1079 fast = (from_kind == to_kind);
1080 if (check_maxchar
1081 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1082 {
1083 /* deny latin1 => ascii */
1084 fast = 0;
1085 }
1086
1087 if (fast) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001088 Py_MEMCPY((char*)to_data + to_kind * to_start,
1089 (char*)from_data + from_kind * from_start,
1090 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001091 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001092 else if (from_kind == PyUnicode_1BYTE_KIND
1093 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001094 {
1095 _PyUnicode_CONVERT_BYTES(
1096 Py_UCS1, Py_UCS2,
1097 PyUnicode_1BYTE_DATA(from) + from_start,
1098 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1099 PyUnicode_2BYTE_DATA(to) + to_start
1100 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001101 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001102 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001103 && to_kind == PyUnicode_4BYTE_KIND)
1104 {
1105 _PyUnicode_CONVERT_BYTES(
1106 Py_UCS1, Py_UCS4,
1107 PyUnicode_1BYTE_DATA(from) + from_start,
1108 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1109 PyUnicode_4BYTE_DATA(to) + to_start
1110 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001111 }
1112 else if (from_kind == PyUnicode_2BYTE_KIND
1113 && to_kind == PyUnicode_4BYTE_KIND)
1114 {
1115 _PyUnicode_CONVERT_BYTES(
1116 Py_UCS2, Py_UCS4,
1117 PyUnicode_2BYTE_DATA(from) + from_start,
1118 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1119 PyUnicode_4BYTE_DATA(to) + to_start
1120 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001121 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001122 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001123 /* check if max_char(from substring) <= max_char(to) */
1124 if (from_kind > to_kind
1125 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001126 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001127 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001128 /* slow path to check for character overflow */
1129 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001130 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001131 Py_ssize_t i;
1132
Victor Stinner56c161a2011-10-06 02:47:11 +02001133#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001134 for (i=0; i < how_many; i++) {
1135 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001136 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001137 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1138 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001139#else
1140 if (!check_maxchar) {
1141 for (i=0; i < how_many; i++) {
1142 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1143 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1144 }
1145 }
1146 else {
1147 for (i=0; i < how_many; i++) {
1148 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1149 if (ch > to_maxchar)
1150 return 1;
1151 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1152 }
1153 }
1154#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001155 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001156 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001157 assert(0 && "inconsistent state");
1158 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001159 }
1160 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001161 return 0;
1162}
1163
1164static void
1165copy_characters(PyObject *to, Py_ssize_t to_start,
1166 PyObject *from, Py_ssize_t from_start,
1167 Py_ssize_t how_many)
1168{
1169 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1170}
1171
1172Py_ssize_t
1173PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1174 PyObject *from, Py_ssize_t from_start,
1175 Py_ssize_t how_many)
1176{
1177 int err;
1178
1179 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1180 PyErr_BadInternalCall();
1181 return -1;
1182 }
1183
1184 if (PyUnicode_READY(from))
1185 return -1;
1186 if (PyUnicode_READY(to))
1187 return -1;
1188
1189 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1190 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1191 PyErr_Format(PyExc_SystemError,
1192 "Cannot write %zi characters at %zi "
1193 "in a string of %zi characters",
1194 how_many, to_start, PyUnicode_GET_LENGTH(to));
1195 return -1;
1196 }
1197
1198 if (how_many == 0)
1199 return 0;
1200
1201 if (_PyUnicode_Dirty(to))
1202 return -1;
1203
1204 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1205 if (err) {
1206 PyErr_Format(PyExc_SystemError,
1207 "Cannot copy %s characters "
1208 "into a string of %s characters",
1209 unicode_kind_name(from),
1210 unicode_kind_name(to));
1211 return -1;
1212 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001213 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001214}
1215
Victor Stinner17222162011-09-28 22:15:37 +02001216/* Find the maximum code point and count the number of surrogate pairs so a
1217 correct string length can be computed before converting a string to UCS4.
1218 This function counts single surrogates as a character and not as a pair.
1219
1220 Return 0 on success, or -1 on error. */
1221static int
1222find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1223 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001224{
1225 const wchar_t *iter;
1226
Victor Stinnerc53be962011-10-02 21:33:54 +02001227 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001228 *num_surrogates = 0;
1229 *maxchar = 0;
1230
1231 for (iter = begin; iter < end; ) {
Victor Stinnerae864852011-10-05 14:02:44 +02001232 if (*iter > *maxchar) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001233 *maxchar = *iter;
Victor Stinnerae864852011-10-05 14:02:44 +02001234#if SIZEOF_WCHAR_T != 2
1235 if (*maxchar >= 0x10000)
1236 return 0;
1237#endif
1238 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001239#if SIZEOF_WCHAR_T == 2
1240 if (*iter >= 0xD800 && *iter <= 0xDBFF
1241 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1242 {
1243 Py_UCS4 surrogate_val;
1244 surrogate_val = (((iter[0] & 0x3FF)<<10)
1245 | (iter[1] & 0x3FF)) + 0x10000;
1246 ++(*num_surrogates);
1247 if (surrogate_val > *maxchar)
1248 *maxchar = surrogate_val;
1249 iter += 2;
1250 }
1251 else
1252 iter++;
1253#else
1254 iter++;
1255#endif
1256 }
1257 return 0;
1258}
1259
1260#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001261static int unicode_ready_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001262#endif
1263
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001264static int
1265unicode_ready(PyObject **p_obj, int replace)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001266{
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001267 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001268 wchar_t *end;
1269 Py_UCS4 maxchar = 0;
1270 Py_ssize_t num_surrogates;
1271#if SIZEOF_WCHAR_T == 2
1272 Py_ssize_t length_wo_surrogates;
1273#endif
1274
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001275 assert(p_obj != NULL);
1276 unicode = (PyUnicodeObject *)*p_obj;
1277
Georg Brandl7597add2011-10-05 16:36:47 +02001278 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001279 strings were created using _PyObject_New() and where no canonical
1280 representation (the str field) has been set yet aka strings
1281 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001282 assert(_PyUnicode_CHECK(unicode));
1283 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001284 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001285 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001286 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001287 /* Actually, it should neither be interned nor be anything else: */
1288 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001289
1290#ifdef Py_DEBUG
1291 ++unicode_ready_calls;
1292#endif
1293
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001294#ifdef Py_DEBUG
1295 assert(!replace || Py_REFCNT(unicode) == 1);
1296#else
1297 if (replace && Py_REFCNT(unicode) != 1)
1298 replace = 0;
1299#endif
1300 if (replace) {
1301 Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode);
1302 wchar_t *wstr = _PyUnicode_WSTR(unicode);
1303 /* Optimization for empty strings */
1304 if (len == 0) {
1305 Py_INCREF(unicode_empty);
1306 Py_DECREF(*p_obj);
1307 *p_obj = unicode_empty;
1308 return 0;
1309 }
1310 if (len == 1 && wstr[0] < 256) {
1311 PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]);
1312 if (latin1_char == NULL)
1313 return -1;
1314 Py_DECREF(*p_obj);
1315 *p_obj = latin1_char;
1316 return 0;
1317 }
1318 }
1319
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001321 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001322 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001323 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001324
1325 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001326 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1327 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001328 PyErr_NoMemory();
1329 return -1;
1330 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001331 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001332 _PyUnicode_WSTR(unicode), end,
1333 PyUnicode_1BYTE_DATA(unicode));
1334 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1335 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1336 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1337 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001338 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001339 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001340 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001341 }
1342 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001343 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001344 _PyUnicode_UTF8(unicode) = NULL;
1345 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001346 }
1347 PyObject_FREE(_PyUnicode_WSTR(unicode));
1348 _PyUnicode_WSTR(unicode) = NULL;
1349 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1350 }
1351 /* In this case we might have to convert down from 4-byte native
1352 wchar_t to 2-byte unicode. */
1353 else if (maxchar < 65536) {
1354 assert(num_surrogates == 0 &&
1355 "FindMaxCharAndNumSurrogatePairs() messed up");
1356
Victor Stinner506f5922011-09-28 22:34:18 +02001357#if SIZEOF_WCHAR_T == 2
1358 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001359 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001360 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1361 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1362 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001363 _PyUnicode_UTF8(unicode) = NULL;
1364 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001365#else
1366 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001367 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001368 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001369 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001370 PyErr_NoMemory();
1371 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001372 }
Victor Stinner506f5922011-09-28 22:34:18 +02001373 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1374 _PyUnicode_WSTR(unicode), end,
1375 PyUnicode_2BYTE_DATA(unicode));
1376 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1377 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1378 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001379 _PyUnicode_UTF8(unicode) = NULL;
1380 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001381 PyObject_FREE(_PyUnicode_WSTR(unicode));
1382 _PyUnicode_WSTR(unicode) = NULL;
1383 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1384#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001385 }
1386 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1387 else {
1388#if SIZEOF_WCHAR_T == 2
1389 /* in case the native representation is 2-bytes, we need to allocate a
1390 new normalized 4-byte version. */
1391 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001392 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1393 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001394 PyErr_NoMemory();
1395 return -1;
1396 }
1397 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1398 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001399 _PyUnicode_UTF8(unicode) = NULL;
1400 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001401 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1402 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001403 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001404 PyObject_FREE(_PyUnicode_WSTR(unicode));
1405 _PyUnicode_WSTR(unicode) = NULL;
1406 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1407#else
1408 assert(num_surrogates == 0);
1409
Victor Stinnerc3c74152011-10-02 20:39:55 +02001410 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001411 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001412 _PyUnicode_UTF8(unicode) = NULL;
1413 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001414 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1415#endif
1416 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1417 }
1418 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001419 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001420 return 0;
1421}
1422
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001423int
1424_PyUnicode_ReadyReplace(PyObject **op)
1425{
1426 return unicode_ready(op, 1);
1427}
1428
1429int
1430_PyUnicode_Ready(PyObject *op)
1431{
1432 return unicode_ready(&op, 0);
1433}
1434
Alexander Belopolsky40018472011-02-26 01:02:56 +00001435static void
1436unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001437{
Walter Dörwald16807132007-05-25 13:52:07 +00001438 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001439 case SSTATE_NOT_INTERNED:
1440 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001441
Benjamin Peterson29060642009-01-31 22:14:21 +00001442 case SSTATE_INTERNED_MORTAL:
1443 /* revive dead object temporarily for DelItem */
1444 Py_REFCNT(unicode) = 3;
1445 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
1446 Py_FatalError(
1447 "deletion of interned string failed");
1448 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001449
Benjamin Peterson29060642009-01-31 22:14:21 +00001450 case SSTATE_INTERNED_IMMORTAL:
1451 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001452
Benjamin Peterson29060642009-01-31 22:14:21 +00001453 default:
1454 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001455 }
1456
Victor Stinner03490912011-10-03 23:45:12 +02001457 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001458 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001459 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001460 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001461
1462 if (PyUnicode_IS_COMPACT(unicode)) {
1463 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001464 }
1465 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001466 if (_PyUnicode_DATA_ANY(unicode))
1467 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Benjamin Peterson29060642009-01-31 22:14:21 +00001468 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001469 }
1470}
1471
#ifdef Py_DEBUG
/* Debug-only predicate: return 1 if `unicode` is one of the shared objects
   kept by this module -- the empty string (unicode_empty) or the cached
   one-character string stored in unicode_latin1[] -- and 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a string of length 1 that is not of WCHAR kind can be a
       unicode_latin1[] entry. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode) ? 1 : 0;
}
#endif
1488
Alexander Belopolsky40018472011-02-26 01:02:56 +00001489static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001490unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001491{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001492 if (Py_REFCNT(unicode) != 1)
1493 return 0;
1494 if (PyUnicode_CHECK_INTERNED(unicode))
1495 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001496#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001497 /* singleton refcount is greater than 1 */
1498 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001499#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001500 return 1;
1501}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001502
Victor Stinnerfe226c02011-10-03 03:52:20 +02001503static int
1504unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1505{
1506 PyObject *unicode;
1507 Py_ssize_t old_length;
1508
1509 assert(p_unicode != NULL);
1510 unicode = *p_unicode;
1511
1512 assert(unicode != NULL);
1513 assert(PyUnicode_Check(unicode));
1514 assert(0 <= length);
1515
Victor Stinner910337b2011-10-03 03:20:16 +02001516 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001517 old_length = PyUnicode_WSTR_LENGTH(unicode);
1518 else
1519 old_length = PyUnicode_GET_LENGTH(unicode);
1520 if (old_length == length)
1521 return 0;
1522
Victor Stinnerfe226c02011-10-03 03:52:20 +02001523 if (!unicode_resizable(unicode)) {
1524 PyObject *copy = resize_copy(unicode, length);
1525 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001526 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001527 Py_DECREF(*p_unicode);
1528 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001529 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001530 }
1531
Victor Stinnerfe226c02011-10-03 03:52:20 +02001532 if (PyUnicode_IS_COMPACT(unicode)) {
1533 *p_unicode = resize_compact(unicode, length);
1534 if (*p_unicode == NULL)
1535 return -1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001536 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001537 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001538 }
1539 return resize_inplace((PyUnicodeObject*)unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001540}
1541
Alexander Belopolsky40018472011-02-26 01:02:56 +00001542int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001543PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001544{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001545 PyObject *unicode;
1546 if (p_unicode == NULL) {
1547 PyErr_BadInternalCall();
1548 return -1;
1549 }
1550 unicode = *p_unicode;
1551 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0
1552 || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND)
1553 {
1554 PyErr_BadInternalCall();
1555 return -1;
1556 }
1557 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001558}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001559
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001560static PyObject*
1561get_latin1_char(unsigned char ch)
1562{
Victor Stinnera464fc12011-10-02 20:39:30 +02001563 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001564 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001565 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001566 if (!unicode)
1567 return NULL;
1568 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001569 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001570 unicode_latin1[ch] = unicode;
1571 }
1572 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001573 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001574}
1575
Alexander Belopolsky40018472011-02-26 01:02:56 +00001576PyObject *
1577PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001578{
1579 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001580 Py_UCS4 maxchar = 0;
1581 Py_ssize_t num_surrogates;
1582
1583 if (u == NULL)
1584 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001585
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001586 /* If the Unicode data is known at construction time, we can apply
1587 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001588
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001589 /* Optimization for empty strings */
1590 if (size == 0 && unicode_empty != NULL) {
1591 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001592 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001593 }
Tim Petersced69f82003-09-16 20:30:58 +00001594
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001595 /* Single character Unicode objects in the Latin-1 range are
1596 shared when using this constructor */
1597 if (size == 1 && *u < 256)
1598 return get_latin1_char((unsigned char)*u);
1599
1600 /* If not empty and not single character, copy the Unicode data
1601 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001602 if (find_maxchar_surrogates(u, u + size,
1603 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001604 return NULL;
1605
1606 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1607 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001608 if (!unicode)
1609 return NULL;
1610
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001611 switch (PyUnicode_KIND(unicode)) {
1612 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001613 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001614 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1615 break;
1616 case PyUnicode_2BYTE_KIND:
1617#if Py_UNICODE_SIZE == 2
1618 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1619#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001620 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001621 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1622#endif
1623 break;
1624 case PyUnicode_4BYTE_KIND:
1625#if SIZEOF_WCHAR_T == 2
1626 /* This is the only case which has to process surrogates, thus
1627 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001628 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001629#else
1630 assert(num_surrogates == 0);
1631 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1632#endif
1633 break;
1634 default:
1635 assert(0 && "Impossible state");
1636 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001637
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001638 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001639 return (PyObject *)unicode;
1640}
1641
Alexander Belopolsky40018472011-02-26 01:02:56 +00001642PyObject *
1643PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001644{
1645 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001646
Benjamin Peterson14339b62009-01-31 16:36:08 +00001647 if (size < 0) {
1648 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001649 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001650 return NULL;
1651 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001652
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001653 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001654 some optimizations which share commonly used objects.
1655 Also, this means the input must be UTF-8, so fall back to the
1656 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001657 if (u != NULL) {
1658
Benjamin Peterson29060642009-01-31 22:14:21 +00001659 /* Optimization for empty strings */
1660 if (size == 0 && unicode_empty != NULL) {
1661 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001662 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001663 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001664
1665 /* Single characters are shared when using this constructor.
1666 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001667 if (size == 1 && Py_CHARMASK(*u) < 128)
1668 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001669
1670 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001671 }
1672
Walter Dörwald55507312007-05-18 13:12:10 +00001673 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001674 if (!unicode)
1675 return NULL;
1676
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001677 return (PyObject *)unicode;
1678}
1679
Alexander Belopolsky40018472011-02-26 01:02:56 +00001680PyObject *
1681PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001682{
1683 size_t size = strlen(u);
1684 if (size > PY_SSIZE_T_MAX) {
1685 PyErr_SetString(PyExc_OverflowError, "input too long");
1686 return NULL;
1687 }
1688
1689 return PyUnicode_FromStringAndSize(u, size);
1690}
1691
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001692PyObject *
1693_PyUnicode_FromId(_Py_Identifier *id)
1694{
1695 if (!id->object) {
1696 id->object = PyUnicode_FromString(id->string);
1697 if (!id->object)
1698 return NULL;
1699 PyUnicode_InternInPlace(&id->object);
1700 assert(!id->next);
1701 id->next = static_strings;
1702 static_strings = id;
1703 }
1704 Py_INCREF(id->object);
1705 return id->object;
1706}
1707
1708void
1709_PyUnicode_ClearStaticStrings()
1710{
1711 _Py_Identifier *i;
1712 for (i = static_strings; i; i = i->next) {
1713 Py_DECREF(i->object);
1714 i->object = NULL;
1715 i->next = NULL;
1716 }
1717}
1718
Victor Stinnere57b1c02011-09-28 22:20:48 +02001719static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001720unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001721{
Victor Stinner0617b6e2011-10-05 23:26:01 +02001722 PyObject *res;
1723#ifdef Py_DEBUG
1724 const unsigned char *p;
1725 const unsigned char *end = s + size;
1726 for (p=s; p < end; p++) {
1727 assert(*p < 128);
1728 }
1729#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001730 if (size == 1)
1731 return get_latin1_char(s[0]);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001732 res = PyUnicode_New(size, 127);
Victor Stinner702c7342011-10-05 13:50:52 +02001733 if (!res)
1734 return NULL;
Victor Stinner0617b6e2011-10-05 23:26:01 +02001735 memcpy(PyUnicode_1BYTE_DATA(res), s, size);
Victor Stinner702c7342011-10-05 13:50:52 +02001736 return res;
1737}
1738
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001739static Py_UCS4
1740kind_maxchar_limit(unsigned int kind)
1741{
1742 switch(kind) {
1743 case PyUnicode_1BYTE_KIND:
1744 return 0x80;
1745 case PyUnicode_2BYTE_KIND:
1746 return 0x100;
1747 case PyUnicode_4BYTE_KIND:
1748 return 0x10000;
1749 default:
1750 assert(0 && "invalid kind");
1751 return 0x10ffff;
1752 }
1753}
1754
Victor Stinner702c7342011-10-05 13:50:52 +02001755static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001756_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001757{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001758 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001759 unsigned char max_char = 127;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001760
1761 assert(size >= 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001762 if (size == 1)
1763 return get_latin1_char(u[0]);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001764 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001765 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001766 if (!res)
1767 return NULL;
1768 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001769 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001770 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001771}
1772
Victor Stinnere57b1c02011-09-28 22:20:48 +02001773static PyObject*
1774_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001775{
1776 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001777 Py_UCS2 max_char = 0;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001778
1779 assert(size >= 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001780 if (size == 1 && u[0] < 256)
Victor Stinner4e101002011-10-11 23:27:52 +02001781 return get_latin1_char((unsigned char)u[0]);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001782 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001783 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001784 if (!res)
1785 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001786 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001787 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001788 else {
1789 _PyUnicode_CONVERT_BYTES(
1790 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1791 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001792 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001793 return res;
1794}
1795
Victor Stinnere57b1c02011-09-28 22:20:48 +02001796static PyObject*
1797_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001798{
1799 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001800 Py_UCS4 max_char = 0;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001801
1802 assert(size >= 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001803 if (size == 1 && u[0] < 256)
1804 return get_latin1_char(u[0]);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001805 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001806 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001807 if (!res)
1808 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001809 if (max_char < 256)
1810 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1811 PyUnicode_1BYTE_DATA(res));
1812 else if (max_char < 0x10000)
1813 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1814 PyUnicode_2BYTE_DATA(res));
1815 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001816 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001817 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001818 return res;
1819}
1820
1821PyObject*
1822PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1823{
1824 switch(kind) {
1825 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001826 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001827 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001828 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001829 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001830 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001831 default:
1832 assert(0 && "invalid kind");
1833 PyErr_SetString(PyExc_SystemError, "invalid kind");
1834 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001835 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001836}
1837
Victor Stinner25a4b292011-10-06 12:31:55 +02001838/* Ensure that a string uses the most efficient storage, if it is not the
1839 case: create a new string with of the right kind. Write NULL into *p_unicode
1840 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001841static void
Victor Stinner25a4b292011-10-06 12:31:55 +02001842unicode_adjust_maxchar(PyObject **p_unicode)
1843{
1844 PyObject *unicode, *copy;
1845 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001846 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02001847 unsigned int kind;
1848
1849 assert(p_unicode != NULL);
1850 unicode = *p_unicode;
1851 assert(PyUnicode_IS_READY(unicode));
1852 if (PyUnicode_IS_ASCII(unicode))
1853 return;
1854
1855 len = PyUnicode_GET_LENGTH(unicode);
1856 kind = PyUnicode_KIND(unicode);
1857 if (kind == PyUnicode_1BYTE_KIND) {
1858 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001859 max_char = ucs1lib_find_max_char(u, u + len);
1860 if (max_char >= 128)
1861 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001862 }
1863 else if (kind == PyUnicode_2BYTE_KIND) {
1864 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001865 max_char = ucs2lib_find_max_char(u, u + len);
1866 if (max_char >= 256)
1867 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001868 }
1869 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001870 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02001871 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001872 max_char = ucs4lib_find_max_char(u, u + len);
1873 if (max_char >= 0x10000)
1874 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001875 }
Victor Stinner25a4b292011-10-06 12:31:55 +02001876 copy = PyUnicode_New(len, max_char);
1877 copy_characters(copy, 0, unicode, 0, len);
1878 Py_DECREF(unicode);
1879 *p_unicode = copy;
1880}
1881
Victor Stinner034f6cf2011-09-30 02:26:44 +02001882PyObject*
1883PyUnicode_Copy(PyObject *unicode)
1884{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001885 Py_ssize_t size;
1886 PyObject *copy;
1887 void *data;
1888
Victor Stinner034f6cf2011-09-30 02:26:44 +02001889 if (!PyUnicode_Check(unicode)) {
1890 PyErr_BadInternalCall();
1891 return NULL;
1892 }
1893 if (PyUnicode_READY(unicode))
1894 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001895
1896 size = PyUnicode_GET_LENGTH(unicode);
1897 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1898 if (!copy)
1899 return NULL;
1900 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1901
1902 data = PyUnicode_DATA(unicode);
1903 switch (PyUnicode_KIND(unicode))
1904 {
1905 case PyUnicode_1BYTE_KIND:
1906 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1907 break;
1908 case PyUnicode_2BYTE_KIND:
1909 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1910 break;
1911 case PyUnicode_4BYTE_KIND:
1912 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1913 break;
1914 default:
1915 assert(0);
1916 break;
1917 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001918 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001919 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001920}
1921
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001922
Victor Stinnerbc603d12011-10-02 01:00:40 +02001923/* Widen Unicode objects to larger buffers. Don't write terminating null
1924 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001925
1926void*
1927_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1928{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001929 Py_ssize_t len;
1930 void *result;
1931 unsigned int skind;
1932
1933 if (PyUnicode_READY(s))
1934 return NULL;
1935
1936 len = PyUnicode_GET_LENGTH(s);
1937 skind = PyUnicode_KIND(s);
1938 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02001939 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001940 return NULL;
1941 }
1942 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001943 case PyUnicode_2BYTE_KIND:
1944 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1945 if (!result)
1946 return PyErr_NoMemory();
1947 assert(skind == PyUnicode_1BYTE_KIND);
1948 _PyUnicode_CONVERT_BYTES(
1949 Py_UCS1, Py_UCS2,
1950 PyUnicode_1BYTE_DATA(s),
1951 PyUnicode_1BYTE_DATA(s) + len,
1952 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001953 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001954 case PyUnicode_4BYTE_KIND:
1955 result = PyMem_Malloc(len * sizeof(Py_UCS4));
1956 if (!result)
1957 return PyErr_NoMemory();
1958 if (skind == PyUnicode_2BYTE_KIND) {
1959 _PyUnicode_CONVERT_BYTES(
1960 Py_UCS2, Py_UCS4,
1961 PyUnicode_2BYTE_DATA(s),
1962 PyUnicode_2BYTE_DATA(s) + len,
1963 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001964 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001965 else {
1966 assert(skind == PyUnicode_1BYTE_KIND);
1967 _PyUnicode_CONVERT_BYTES(
1968 Py_UCS1, Py_UCS4,
1969 PyUnicode_1BYTE_DATA(s),
1970 PyUnicode_1BYTE_DATA(s) + len,
1971 result);
1972 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001973 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001974 default:
1975 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001976 }
Victor Stinner01698042011-10-04 00:04:26 +02001977 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001978 return NULL;
1979}
1980
1981static Py_UCS4*
1982as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1983 int copy_null)
1984{
1985 int kind;
1986 void *data;
1987 Py_ssize_t len, targetlen;
1988 if (PyUnicode_READY(string) == -1)
1989 return NULL;
1990 kind = PyUnicode_KIND(string);
1991 data = PyUnicode_DATA(string);
1992 len = PyUnicode_GET_LENGTH(string);
1993 targetlen = len;
1994 if (copy_null)
1995 targetlen++;
1996 if (!target) {
1997 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1998 PyErr_NoMemory();
1999 return NULL;
2000 }
2001 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2002 if (!target) {
2003 PyErr_NoMemory();
2004 return NULL;
2005 }
2006 }
2007 else {
2008 if (targetsize < targetlen) {
2009 PyErr_Format(PyExc_SystemError,
2010 "string is longer than the buffer");
2011 if (copy_null && 0 < targetsize)
2012 target[0] = 0;
2013 return NULL;
2014 }
2015 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002016 if (kind == PyUnicode_1BYTE_KIND) {
2017 Py_UCS1 *start = (Py_UCS1 *) data;
2018 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002019 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002020 else if (kind == PyUnicode_2BYTE_KIND) {
2021 Py_UCS2 *start = (Py_UCS2 *) data;
2022 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2023 }
2024 else {
2025 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002026 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002027 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002028 if (copy_null)
2029 target[len] = 0;
2030 return target;
2031}
2032
2033Py_UCS4*
2034PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2035 int copy_null)
2036{
2037 if (target == NULL || targetsize < 1) {
2038 PyErr_BadInternalCall();
2039 return NULL;
2040 }
2041 return as_ucs4(string, target, targetsize, copy_null);
2042}
2043
2044Py_UCS4*
2045PyUnicode_AsUCS4Copy(PyObject *string)
2046{
2047 return as_ucs4(string, NULL, 0, 1);
2048}
2049
#ifdef HAVE_WCHAR_H

/* Create a unicode object from the wide-character buffer `w`.
   A `size` of -1 means "measure the buffer with wcslen()".
   With w == NULL, a size of 0 yields a new empty string and any other size
   raises a "bad internal call" error. */
PyObject *
PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
{
    if (w != NULL) {
        if (size == -1) {
            size = wcslen(w);
        }
        return PyUnicode_FromUnicode(w, size);
    }

    if (size == 0)
        return PyUnicode_New(0, 0);
    PyErr_BadInternalCall();
    return NULL;
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002070
Walter Dörwald346737f2007-05-31 10:44:43 +00002071static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002072makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2073 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002074{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002075 *fmt++ = '%';
2076 if (width) {
2077 if (zeropad)
2078 *fmt++ = '0';
2079 fmt += sprintf(fmt, "%d", width);
2080 }
2081 if (precision)
2082 fmt += sprintf(fmt, ".%d", precision);
2083 if (longflag)
2084 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002085 else if (longlongflag) {
2086 /* longlongflag should only ever be nonzero on machines with
2087 HAVE_LONG_LONG defined */
2088#ifdef HAVE_LONG_LONG
2089 char *f = PY_FORMAT_LONG_LONG;
2090 while (*f)
2091 *fmt++ = *f++;
2092#else
2093 /* we shouldn't ever get here */
2094 assert(0);
2095 *fmt++ = 'l';
2096#endif
2097 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002098 else if (size_tflag) {
2099 char *f = PY_FORMAT_SIZE_T;
2100 while (*f)
2101 *fmt++ = *f++;
2102 }
2103 *fmt++ = c;
2104 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002105}
2106
Victor Stinner96865452011-03-01 23:44:09 +00002107/* helper for PyUnicode_FromFormatV() */
2108
2109static const char*
2110parse_format_flags(const char *f,
2111 int *p_width, int *p_precision,
2112 int *p_longflag, int *p_longlongflag, int *p_size_tflag)
2113{
2114 int width, precision, longflag, longlongflag, size_tflag;
2115
2116 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2117 f++;
2118 width = 0;
2119 while (Py_ISDIGIT((unsigned)*f))
2120 width = (width*10) + *f++ - '0';
2121 precision = 0;
2122 if (*f == '.') {
2123 f++;
2124 while (Py_ISDIGIT((unsigned)*f))
2125 precision = (precision*10) + *f++ - '0';
2126 if (*f == '%') {
2127 /* "%.3%s" => f points to "3" */
2128 f--;
2129 }
2130 }
2131 if (*f == '\0') {
2132 /* bogus format "%.1" => go backward, f points to "1" */
2133 f--;
2134 }
2135 if (p_width != NULL)
2136 *p_width = width;
2137 if (p_precision != NULL)
2138 *p_precision = precision;
2139
2140 /* Handle %ld, %lu, %lld and %llu. */
2141 longflag = 0;
2142 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002143 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002144
2145 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002146 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002147 longflag = 1;
2148 ++f;
2149 }
2150#ifdef HAVE_LONG_LONG
2151 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002152 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002153 longlongflag = 1;
2154 f += 2;
2155 }
2156#endif
2157 }
2158 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002159 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002160 size_tflag = 1;
2161 ++f;
2162 }
2163 if (p_longflag != NULL)
2164 *p_longflag = longflag;
2165 if (p_longlongflag != NULL)
2166 *p_longlongflag = longlongflag;
2167 if (p_size_tflag != NULL)
2168 *p_size_tflag = size_tflag;
2169 return f;
2170}
2171
/* Maximum number of characters required for the output of %ld.  21
   characters allow for a 64-bit integer (in decimal) and an optional sign. */
#define MAX_LONG_CHARS 21
/* Maximum number of characters required for the output of %lld.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2179
Walter Dörwaldd2034312007-05-18 16:29:38 +00002180PyObject *
2181PyUnicode_FromFormatV(const char *format, va_list vargs)
2182{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002183 va_list count;
2184 Py_ssize_t callcount = 0;
2185 PyObject **callresults = NULL;
2186 PyObject **callresult = NULL;
2187 Py_ssize_t n = 0;
2188 int width = 0;
2189 int precision = 0;
2190 int zeropad;
2191 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002192 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002193 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002194 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002195 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2196 Py_UCS4 argmaxchar;
2197 Py_ssize_t numbersize = 0;
2198 char *numberresults = NULL;
2199 char *numberresult = NULL;
2200 Py_ssize_t i;
2201 int kind;
2202 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002203
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002204 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002205 /* step 1: count the number of %S/%R/%A/%s format specifications
2206 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2207 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002208 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002209 * also estimate a upper bound for all the number formats in the string,
2210 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002211 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002212 for (f = format; *f; f++) {
2213 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002214 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002215 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2216 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2217 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2218 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002219
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002220 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002221#ifdef HAVE_LONG_LONG
2222 if (longlongflag) {
2223 if (width < MAX_LONG_LONG_CHARS)
2224 width = MAX_LONG_LONG_CHARS;
2225 }
2226 else
2227#endif
2228 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2229 including sign. Decimal takes the most space. This
2230 isn't enough for octal. If a width is specified we
2231 need more (which we allocate later). */
2232 if (width < MAX_LONG_CHARS)
2233 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002234
2235 /* account for the size + '\0' to separate numbers
2236 inside of the numberresults buffer */
2237 numbersize += (width + 1);
2238 }
2239 }
2240 else if ((unsigned char)*f > 127) {
2241 PyErr_Format(PyExc_ValueError,
2242 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2243 "string, got a non-ASCII byte: 0x%02x",
2244 (unsigned char)*f);
2245 return NULL;
2246 }
2247 }
2248 /* step 2: allocate memory for the results of
2249 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2250 if (callcount) {
2251 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2252 if (!callresults) {
2253 PyErr_NoMemory();
2254 return NULL;
2255 }
2256 callresult = callresults;
2257 }
2258 /* step 2.5: allocate memory for the results of formating numbers */
2259 if (numbersize) {
2260 numberresults = PyObject_Malloc(numbersize);
2261 if (!numberresults) {
2262 PyErr_NoMemory();
2263 goto fail;
2264 }
2265 numberresult = numberresults;
2266 }
2267
2268 /* step 3: format numbers and figure out how large a buffer we need */
2269 for (f = format; *f; f++) {
2270 if (*f == '%') {
2271 const char* p;
2272 int longflag;
2273 int longlongflag;
2274 int size_tflag;
2275 int numprinted;
2276
2277 p = f;
2278 zeropad = (f[1] == '0');
2279 f = parse_format_flags(f, &width, &precision,
2280 &longflag, &longlongflag, &size_tflag);
2281 switch (*f) {
2282 case 'c':
2283 {
2284 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002285 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002286 n++;
2287 break;
2288 }
2289 case '%':
2290 n++;
2291 break;
2292 case 'i':
2293 case 'd':
2294 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2295 width, precision, *f);
2296 if (longflag)
2297 numprinted = sprintf(numberresult, fmt,
2298 va_arg(count, long));
2299#ifdef HAVE_LONG_LONG
2300 else if (longlongflag)
2301 numprinted = sprintf(numberresult, fmt,
2302 va_arg(count, PY_LONG_LONG));
2303#endif
2304 else if (size_tflag)
2305 numprinted = sprintf(numberresult, fmt,
2306 va_arg(count, Py_ssize_t));
2307 else
2308 numprinted = sprintf(numberresult, fmt,
2309 va_arg(count, int));
2310 n += numprinted;
2311 /* advance by +1 to skip over the '\0' */
2312 numberresult += (numprinted + 1);
2313 assert(*(numberresult - 1) == '\0');
2314 assert(*(numberresult - 2) != '\0');
2315 assert(numprinted >= 0);
2316 assert(numberresult <= numberresults + numbersize);
2317 break;
2318 case 'u':
2319 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2320 width, precision, 'u');
2321 if (longflag)
2322 numprinted = sprintf(numberresult, fmt,
2323 va_arg(count, unsigned long));
2324#ifdef HAVE_LONG_LONG
2325 else if (longlongflag)
2326 numprinted = sprintf(numberresult, fmt,
2327 va_arg(count, unsigned PY_LONG_LONG));
2328#endif
2329 else if (size_tflag)
2330 numprinted = sprintf(numberresult, fmt,
2331 va_arg(count, size_t));
2332 else
2333 numprinted = sprintf(numberresult, fmt,
2334 va_arg(count, unsigned int));
2335 n += numprinted;
2336 numberresult += (numprinted + 1);
2337 assert(*(numberresult - 1) == '\0');
2338 assert(*(numberresult - 2) != '\0');
2339 assert(numprinted >= 0);
2340 assert(numberresult <= numberresults + numbersize);
2341 break;
2342 case 'x':
2343 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2344 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2345 n += numprinted;
2346 numberresult += (numprinted + 1);
2347 assert(*(numberresult - 1) == '\0');
2348 assert(*(numberresult - 2) != '\0');
2349 assert(numprinted >= 0);
2350 assert(numberresult <= numberresults + numbersize);
2351 break;
2352 case 'p':
2353 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2354 /* %p is ill-defined: ensure leading 0x. */
2355 if (numberresult[1] == 'X')
2356 numberresult[1] = 'x';
2357 else if (numberresult[1] != 'x') {
2358 memmove(numberresult + 2, numberresult,
2359 strlen(numberresult) + 1);
2360 numberresult[0] = '0';
2361 numberresult[1] = 'x';
2362 numprinted += 2;
2363 }
2364 n += numprinted;
2365 numberresult += (numprinted + 1);
2366 assert(*(numberresult - 1) == '\0');
2367 assert(*(numberresult - 2) != '\0');
2368 assert(numprinted >= 0);
2369 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002370 break;
2371 case 's':
2372 {
2373 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002374 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002375 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2376 if (!str)
2377 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002378 /* since PyUnicode_DecodeUTF8 returns already flexible
2379 unicode objects, there is no need to call ready on them */
2380 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002381 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002382 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002383 /* Remember the str and switch to the next slot */
2384 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002385 break;
2386 }
2387 case 'U':
2388 {
2389 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002390 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002391 if (PyUnicode_READY(obj) == -1)
2392 goto fail;
2393 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002394 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002395 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002396 break;
2397 }
2398 case 'V':
2399 {
2400 PyObject *obj = va_arg(count, PyObject *);
2401 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002402 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002403 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002404 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002405 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002406 if (PyUnicode_READY(obj) == -1)
2407 goto fail;
2408 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002409 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002410 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002411 *callresult++ = NULL;
2412 }
2413 else {
2414 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2415 if (!str_obj)
2416 goto fail;
Victor Stinnere1335c72011-10-04 20:53:03 +02002417 if (PyUnicode_READY(str_obj)) {
2418 Py_DECREF(str_obj);
2419 goto fail;
2420 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002421 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002422 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002423 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002424 *callresult++ = str_obj;
2425 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002426 break;
2427 }
2428 case 'S':
2429 {
2430 PyObject *obj = va_arg(count, PyObject *);
2431 PyObject *str;
2432 assert(obj);
2433 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002434 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002435 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002436 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002437 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002438 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002439 /* Remember the str and switch to the next slot */
2440 *callresult++ = str;
2441 break;
2442 }
2443 case 'R':
2444 {
2445 PyObject *obj = va_arg(count, PyObject *);
2446 PyObject *repr;
2447 assert(obj);
2448 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002449 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002450 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002451 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002452 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002453 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002454 /* Remember the repr and switch to the next slot */
2455 *callresult++ = repr;
2456 break;
2457 }
2458 case 'A':
2459 {
2460 PyObject *obj = va_arg(count, PyObject *);
2461 PyObject *ascii;
2462 assert(obj);
2463 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002464 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002465 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002466 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002467 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002468 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002469 /* Remember the repr and switch to the next slot */
2470 *callresult++ = ascii;
2471 break;
2472 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002473 default:
2474 /* if we stumble upon an unknown
2475 formatting code, copy the rest of
2476 the format string to the output
2477 string. (we cannot just skip the
2478 code, since there's no way to know
2479 what's in the argument list) */
2480 n += strlen(p);
2481 goto expand;
2482 }
2483 } else
2484 n++;
2485 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002486 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002487 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002488 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002489 we don't have to resize the string.
2490 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002491 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002492 if (!string)
2493 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002494 kind = PyUnicode_KIND(string);
2495 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002496 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002497 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002498
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002499 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002500 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002501 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002502
2503 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002504 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2505 /* checking for == because the last argument could be a empty
2506 string, which causes i to point to end, the assert at the end of
2507 the loop */
2508 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002509
Benjamin Peterson14339b62009-01-31 16:36:08 +00002510 switch (*f) {
2511 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002512 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002513 const int ordinal = va_arg(vargs, int);
2514 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002515 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002516 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002517 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002518 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002519 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002520 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002521 case 'p':
2522 /* unused, since we already have the result */
2523 if (*f == 'p')
2524 (void) va_arg(vargs, void *);
2525 else
2526 (void) va_arg(vargs, int);
2527 /* extract the result from numberresults and append. */
2528 for (; *numberresult; ++i, ++numberresult)
2529 PyUnicode_WRITE(kind, data, i, *numberresult);
2530 /* skip over the separating '\0' */
2531 assert(*numberresult == '\0');
2532 numberresult++;
2533 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002534 break;
2535 case 's':
2536 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002537 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002538 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002539 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002540 size = PyUnicode_GET_LENGTH(*callresult);
2541 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002542 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002543 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002544 /* We're done with the unicode()/repr() => forget it */
2545 Py_DECREF(*callresult);
2546 /* switch to next unicode()/repr() result */
2547 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002548 break;
2549 }
2550 case 'U':
2551 {
2552 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002553 Py_ssize_t size;
2554 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2555 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002556 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002557 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002558 break;
2559 }
2560 case 'V':
2561 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002562 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002563 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002564 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002565 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002566 size = PyUnicode_GET_LENGTH(obj);
2567 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002568 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002569 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002570 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002571 size = PyUnicode_GET_LENGTH(*callresult);
2572 assert(PyUnicode_KIND(*callresult) <=
2573 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002574 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002575 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002576 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002577 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002578 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002579 break;
2580 }
2581 case 'S':
2582 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002583 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002584 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002585 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002586 /* unused, since we already have the result */
2587 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002588 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002589 copy_characters(string, i, *callresult, 0, size);
2590 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002591 /* We're done with the unicode()/repr() => forget it */
2592 Py_DECREF(*callresult);
2593 /* switch to next unicode()/repr() result */
2594 ++callresult;
2595 break;
2596 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002597 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002598 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002599 break;
2600 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002601 for (; *p; ++p, ++i)
2602 PyUnicode_WRITE(kind, data, i, *p);
2603 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002604 goto end;
2605 }
Victor Stinner1205f272010-09-11 00:54:47 +00002606 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002607 else {
2608 assert(i < PyUnicode_GET_LENGTH(string));
2609 PyUnicode_WRITE(kind, data, i++, *f);
2610 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002611 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002612 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002613
Benjamin Peterson29060642009-01-31 22:14:21 +00002614 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002615 if (callresults)
2616 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002617 if (numberresults)
2618 PyObject_Free(numberresults);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002619 assert(_PyUnicode_CheckConsistency(string, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002620 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00002621 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002622 if (callresults) {
2623 PyObject **callresult2 = callresults;
2624 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002625 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002626 ++callresult2;
2627 }
2628 PyObject_Free(callresults);
2629 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002630 if (numberresults)
2631 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002632 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002633}
2634
Walter Dörwaldd2034312007-05-18 16:29:38 +00002635PyObject *
2636PyUnicode_FromFormat(const char *format, ...)
2637{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002638 PyObject* ret;
2639 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002640
2641#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002642 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002643#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002644 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002645#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002646 ret = PyUnicode_FromFormatV(format, vargs);
2647 va_end(vargs);
2648 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002649}
2650
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002651#ifdef HAVE_WCHAR_H
2652
Victor Stinner5593d8a2010-10-02 11:11:27 +00002653/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2654 convert a Unicode object to a wide character string.
2655
Victor Stinnerd88d9832011-09-06 02:00:05 +02002656 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002657 character) required to convert the unicode object. Ignore size argument.
2658
Victor Stinnerd88d9832011-09-06 02:00:05 +02002659 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002660 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002661 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002662static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00002663unicode_aswidechar(PyUnicodeObject *unicode,
2664 wchar_t *w,
2665 Py_ssize_t size)
2666{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002667 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002668 const wchar_t *wstr;
2669
2670 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
2671 if (wstr == NULL)
2672 return -1;
2673
Victor Stinner5593d8a2010-10-02 11:11:27 +00002674 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002675 if (size > res)
2676 size = res + 1;
2677 else
2678 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002679 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002680 return res;
2681 }
2682 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002683 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002684}
2685
2686Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002687PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002688 wchar_t *w,
2689 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002690{
2691 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002692 PyErr_BadInternalCall();
2693 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002694 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002695 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002696}
2697
Victor Stinner137c34c2010-09-29 10:25:54 +00002698wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002699PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002700 Py_ssize_t *size)
2701{
2702 wchar_t* buffer;
2703 Py_ssize_t buflen;
2704
2705 if (unicode == NULL) {
2706 PyErr_BadInternalCall();
2707 return NULL;
2708 }
2709
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002710 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002711 if (buflen == -1)
2712 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002713 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002714 PyErr_NoMemory();
2715 return NULL;
2716 }
2717
Victor Stinner137c34c2010-09-29 10:25:54 +00002718 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2719 if (buffer == NULL) {
2720 PyErr_NoMemory();
2721 return NULL;
2722 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002723 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002724 if (buflen == -1)
2725 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002726 if (size != NULL)
2727 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002728 return buffer;
2729}
2730
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002731#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002732
Alexander Belopolsky40018472011-02-26 01:02:56 +00002733PyObject *
2734PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002735{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002736 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002737 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002738 PyErr_SetString(PyExc_ValueError,
2739 "chr() arg not in range(0x110000)");
2740 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002741 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002742
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002743 if (ordinal < 256)
2744 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002745
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002746 v = PyUnicode_New(1, ordinal);
2747 if (v == NULL)
2748 return NULL;
2749 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002750 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002751 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002752}
2753
Alexander Belopolsky40018472011-02-26 01:02:56 +00002754PyObject *
2755PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002756{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002757 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002758 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002759 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002760 if (PyUnicode_READY(obj))
2761 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002762 Py_INCREF(obj);
2763 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002764 }
2765 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002766 /* For a Unicode subtype that's not a Unicode object,
2767 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002768 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002769 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002770 PyErr_Format(PyExc_TypeError,
2771 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002772 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002773 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002774}
2775
Alexander Belopolsky40018472011-02-26 01:02:56 +00002776PyObject *
2777PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002778 const char *encoding,
2779 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002780{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002781 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002782 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002783
Guido van Rossumd57fd912000-03-10 22:53:23 +00002784 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002785 PyErr_BadInternalCall();
2786 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002787 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002788
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002789 /* Decoding bytes objects is the most common case and should be fast */
2790 if (PyBytes_Check(obj)) {
2791 if (PyBytes_GET_SIZE(obj) == 0) {
2792 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002793 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002794 }
2795 else {
2796 v = PyUnicode_Decode(
2797 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2798 encoding, errors);
2799 }
2800 return v;
2801 }
2802
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002803 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002804 PyErr_SetString(PyExc_TypeError,
2805 "decoding str is not supported");
2806 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002807 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002808
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002809 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2810 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2811 PyErr_Format(PyExc_TypeError,
2812 "coercing to str: need bytes, bytearray "
2813 "or buffer-like object, %.80s found",
2814 Py_TYPE(obj)->tp_name);
2815 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002816 }
Tim Petersced69f82003-09-16 20:30:58 +00002817
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002818 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002819 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002820 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002821 }
Tim Petersced69f82003-09-16 20:30:58 +00002822 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002823 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002824
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002825 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002826 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002827}
2828
/* Write a normalized copy of encoding into lower: upper case letters
   become lower case and '_' becomes '-', so that spellings such as UTF_8
   are recognized.  lower_len is the capacity of lower, including the
   terminating '\0'.

   Return 1 on success, 0 if encoding does not fit (it is longer than
   lower_len-1); in that case lower is left without a terminator. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src = encoding;
    char *dst = lower;
    /* last slot of the buffer, reserved for the terminating '\0' */
    char *const limit = lower + (lower_len - 1);

    for (; *src != '\0'; src++) {
        if (dst == limit)
            return 0;

        if (Py_ISUPPER(*src))
            *dst = Py_TOLOWER(*src);
        else if (*src == '_')
            *dst = '-';
        else
            *dst = *src;
        dst++;
    }
    *dst = '\0';
    return 1;
}
2861
Alexander Belopolsky40018472011-02-26 01:02:56 +00002862PyObject *
2863PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002864 Py_ssize_t size,
2865 const char *encoding,
2866 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002867{
2868 PyObject *buffer = NULL, *unicode;
2869 Py_buffer info;
2870 char lower[11]; /* Enough for any encoding shortcut */
2871
2872 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002873 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002874
2875 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002876 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002877 if ((strcmp(lower, "utf-8") == 0) ||
2878 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002879 return PyUnicode_DecodeUTF8(s, size, errors);
2880 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002881 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002882 (strcmp(lower, "iso-8859-1") == 0))
2883 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002884#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002885 else if (strcmp(lower, "mbcs") == 0)
2886 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002887#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002888 else if (strcmp(lower, "ascii") == 0)
2889 return PyUnicode_DecodeASCII(s, size, errors);
2890 else if (strcmp(lower, "utf-16") == 0)
2891 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2892 else if (strcmp(lower, "utf-32") == 0)
2893 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2894 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002895
2896 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002897 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002898 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002899 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002900 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002901 if (buffer == NULL)
2902 goto onError;
2903 unicode = PyCodec_Decode(buffer, encoding, errors);
2904 if (unicode == NULL)
2905 goto onError;
2906 if (!PyUnicode_Check(unicode)) {
2907 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002908 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002909 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002910 Py_DECREF(unicode);
2911 goto onError;
2912 }
2913 Py_DECREF(buffer);
Victor Stinner17efeed2011-10-04 20:05:46 +02002914#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02002915 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002916 Py_DECREF(unicode);
2917 return NULL;
2918 }
Victor Stinner17efeed2011-10-04 20:05:46 +02002919#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002920 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002921 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002922
Benjamin Peterson29060642009-01-31 22:14:21 +00002923 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002924 Py_XDECREF(buffer);
2925 return NULL;
2926}
2927
Alexander Belopolsky40018472011-02-26 01:02:56 +00002928PyObject *
2929PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002930 const char *encoding,
2931 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002932{
2933 PyObject *v;
2934
2935 if (!PyUnicode_Check(unicode)) {
2936 PyErr_BadArgument();
2937 goto onError;
2938 }
2939
2940 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002941 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002942
2943 /* Decode via the codec registry */
2944 v = PyCodec_Decode(unicode, encoding, errors);
2945 if (v == NULL)
2946 goto onError;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002947 assert(_PyUnicode_CheckConsistency(v, 1));
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002948 return v;
2949
Benjamin Peterson29060642009-01-31 22:14:21 +00002950 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002951 return NULL;
2952}
2953
Alexander Belopolsky40018472011-02-26 01:02:56 +00002954PyObject *
2955PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002956 const char *encoding,
2957 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002958{
2959 PyObject *v;
2960
2961 if (!PyUnicode_Check(unicode)) {
2962 PyErr_BadArgument();
2963 goto onError;
2964 }
2965
2966 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002967 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002968
2969 /* Decode via the codec registry */
2970 v = PyCodec_Decode(unicode, encoding, errors);
2971 if (v == NULL)
2972 goto onError;
2973 if (!PyUnicode_Check(v)) {
2974 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002975 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002976 Py_TYPE(v)->tp_name);
2977 Py_DECREF(v);
2978 goto onError;
2979 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002980 assert(_PyUnicode_CheckConsistency(v, 1));
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002981 return v;
2982
Benjamin Peterson29060642009-01-31 22:14:21 +00002983 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002984 return NULL;
2985}
2986
Alexander Belopolsky40018472011-02-26 01:02:56 +00002987PyObject *
2988PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002989 Py_ssize_t size,
2990 const char *encoding,
2991 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002992{
2993 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002994
Guido van Rossumd57fd912000-03-10 22:53:23 +00002995 unicode = PyUnicode_FromUnicode(s, size);
2996 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002997 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002998 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2999 Py_DECREF(unicode);
3000 return v;
3001}
3002
Alexander Belopolsky40018472011-02-26 01:02:56 +00003003PyObject *
3004PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003005 const char *encoding,
3006 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003007{
3008 PyObject *v;
3009
3010 if (!PyUnicode_Check(unicode)) {
3011 PyErr_BadArgument();
3012 goto onError;
3013 }
3014
3015 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003016 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003017
3018 /* Encode via the codec registry */
3019 v = PyCodec_Encode(unicode, encoding, errors);
3020 if (v == NULL)
3021 goto onError;
3022 return v;
3023
Benjamin Peterson29060642009-01-31 22:14:21 +00003024 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003025 return NULL;
3026}
3027
Victor Stinnerad158722010-10-27 00:25:46 +00003028PyObject *
3029PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003030{
Victor Stinner99b95382011-07-04 14:23:54 +02003031#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003032 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3033 PyUnicode_GET_SIZE(unicode),
3034 NULL);
3035#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003036 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003037#else
Victor Stinner793b5312011-04-27 00:24:21 +02003038 PyInterpreterState *interp = PyThreadState_GET()->interp;
3039 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3040 cannot use it to encode and decode filenames before it is loaded. Load
3041 the Python codec requires to encode at least its own filename. Use the C
3042 version of the locale codec until the codec registry is initialized and
3043 the Python codec is loaded.
3044
3045 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3046 cannot only rely on it: check also interp->fscodec_initialized for
3047 subinterpreters. */
3048 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003049 return PyUnicode_AsEncodedString(unicode,
3050 Py_FileSystemDefaultEncoding,
3051 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003052 }
3053 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003054 /* locale encoding with surrogateescape */
3055 wchar_t *wchar;
3056 char *bytes;
3057 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00003058 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003059
3060 wchar = PyUnicode_AsWideCharString(unicode, NULL);
3061 if (wchar == NULL)
3062 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003063 bytes = _Py_wchar2char(wchar, &error_pos);
3064 if (bytes == NULL) {
3065 if (error_pos != (size_t)-1) {
3066 char *errmsg = strerror(errno);
3067 PyObject *exc = NULL;
3068 if (errmsg == NULL)
3069 errmsg = "Py_wchar2char() failed";
3070 raise_encode_exception(&exc,
3071 "filesystemencoding",
3072 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
3073 error_pos, error_pos+1,
3074 errmsg);
3075 Py_XDECREF(exc);
3076 }
3077 else
3078 PyErr_NoMemory();
3079 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003080 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003081 }
3082 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003083
3084 bytes_obj = PyBytes_FromString(bytes);
3085 PyMem_Free(bytes);
3086 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00003087 }
Victor Stinnerad158722010-10-27 00:25:46 +00003088#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003089}
3090
Alexander Belopolsky40018472011-02-26 01:02:56 +00003091PyObject *
3092PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003093 const char *encoding,
3094 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003095{
3096 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003097 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003098
Guido van Rossumd57fd912000-03-10 22:53:23 +00003099 if (!PyUnicode_Check(unicode)) {
3100 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003101 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003102 }
Fred Drakee4315f52000-05-09 19:53:39 +00003103
Victor Stinner2f283c22011-03-02 01:21:46 +00003104 if (encoding == NULL) {
3105 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003106 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003107 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003108 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00003109 }
Fred Drakee4315f52000-05-09 19:53:39 +00003110
3111 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003112 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003113 if ((strcmp(lower, "utf-8") == 0) ||
3114 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003115 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003116 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003117 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003118 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003119 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003120 }
Victor Stinner37296e82010-06-10 13:36:23 +00003121 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003122 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003123 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003124 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003125#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003126 else if (strcmp(lower, "mbcs") == 0)
3127 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3128 PyUnicode_GET_SIZE(unicode),
3129 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003130#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003131 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003132 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003133 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003134
3135 /* Encode via the codec registry */
3136 v = PyCodec_Encode(unicode, encoding, errors);
3137 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003138 return NULL;
3139
3140 /* The normal path */
3141 if (PyBytes_Check(v))
3142 return v;
3143
3144 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003145 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003146 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003147 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003148
3149 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3150 "encoder %s returned bytearray instead of bytes",
3151 encoding);
3152 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003153 Py_DECREF(v);
3154 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003155 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003156
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003157 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3158 Py_DECREF(v);
3159 return b;
3160 }
3161
3162 PyErr_Format(PyExc_TypeError,
3163 "encoder did not return a bytes object (type=%.400s)",
3164 Py_TYPE(v)->tp_name);
3165 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003166 return NULL;
3167}
3168
Alexander Belopolsky40018472011-02-26 01:02:56 +00003169PyObject *
3170PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003171 const char *encoding,
3172 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003173{
3174 PyObject *v;
3175
3176 if (!PyUnicode_Check(unicode)) {
3177 PyErr_BadArgument();
3178 goto onError;
3179 }
3180
3181 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003182 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003183
3184 /* Encode via the codec registry */
3185 v = PyCodec_Encode(unicode, encoding, errors);
3186 if (v == NULL)
3187 goto onError;
3188 if (!PyUnicode_Check(v)) {
3189 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003190 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003191 Py_TYPE(v)->tp_name);
3192 Py_DECREF(v);
3193 goto onError;
3194 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003195 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003196
Benjamin Peterson29060642009-01-31 22:14:21 +00003197 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003198 return NULL;
3199}
3200
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003201PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003202PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003203 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003204 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3205}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003206
Christian Heimes5894ba72007-11-04 11:43:14 +00003207PyObject*
3208PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3209{
Victor Stinner99b95382011-07-04 14:23:54 +02003210#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003211 return PyUnicode_DecodeMBCS(s, size, NULL);
3212#elif defined(__APPLE__)
3213 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
3214#else
Victor Stinner793b5312011-04-27 00:24:21 +02003215 PyInterpreterState *interp = PyThreadState_GET()->interp;
3216 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3217 cannot use it to encode and decode filenames before it is loaded. Load
3218 the Python codec requires to encode at least its own filename. Use the C
3219 version of the locale codec until the codec registry is initialized and
3220 the Python codec is loaded.
3221
3222 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3223 cannot only rely on it: check also interp->fscodec_initialized for
3224 subinterpreters. */
3225 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003226 return PyUnicode_Decode(s, size,
3227 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003228 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003229 }
3230 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003231 /* locale encoding with surrogateescape */
3232 wchar_t *wchar;
3233 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00003234 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003235
3236 if (s[size] != '\0' || size != strlen(s)) {
3237 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3238 return NULL;
3239 }
3240
Victor Stinner168e1172010-10-16 23:16:16 +00003241 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003242 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00003243 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003244
Victor Stinner168e1172010-10-16 23:16:16 +00003245 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003246 PyMem_Free(wchar);
3247 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003248 }
Victor Stinnerad158722010-10-27 00:25:46 +00003249#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003250}
3251
Martin v. Löwis011e8422009-05-05 04:43:17 +00003252
3253int
3254PyUnicode_FSConverter(PyObject* arg, void* addr)
3255{
3256 PyObject *output = NULL;
3257 Py_ssize_t size;
3258 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003259 if (arg == NULL) {
3260 Py_DECREF(*(PyObject**)addr);
3261 return 1;
3262 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003263 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003264 output = arg;
3265 Py_INCREF(output);
3266 }
3267 else {
3268 arg = PyUnicode_FromObject(arg);
3269 if (!arg)
3270 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003271 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003272 Py_DECREF(arg);
3273 if (!output)
3274 return 0;
3275 if (!PyBytes_Check(output)) {
3276 Py_DECREF(output);
3277 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3278 return 0;
3279 }
3280 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003281 size = PyBytes_GET_SIZE(output);
3282 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003283 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003284 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003285 Py_DECREF(output);
3286 return 0;
3287 }
3288 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003289 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003290}
3291
3292
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003293int
3294PyUnicode_FSDecoder(PyObject* arg, void* addr)
3295{
3296 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003297 if (arg == NULL) {
3298 Py_DECREF(*(PyObject**)addr);
3299 return 1;
3300 }
3301 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003302 if (PyUnicode_READY(arg))
3303 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003304 output = arg;
3305 Py_INCREF(output);
3306 }
3307 else {
3308 arg = PyBytes_FromObject(arg);
3309 if (!arg)
3310 return 0;
3311 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3312 PyBytes_GET_SIZE(arg));
3313 Py_DECREF(arg);
3314 if (!output)
3315 return 0;
3316 if (!PyUnicode_Check(output)) {
3317 Py_DECREF(output);
3318 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3319 return 0;
3320 }
3321 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003322 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003323 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003324 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3325 Py_DECREF(output);
3326 return 0;
3327 }
3328 *(PyObject**)addr = output;
3329 return Py_CLEANUP_SUPPORTED;
3330}
3331
3332
Martin v. Löwis5b222132007-06-10 09:51:05 +00003333char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003334PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003335{
Christian Heimesf3863112007-11-22 07:46:41 +00003336 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003337 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
3338
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003339 if (!PyUnicode_Check(unicode)) {
3340 PyErr_BadArgument();
3341 return NULL;
3342 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003343 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003344 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003345
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003346 if (PyUnicode_UTF8(unicode) == NULL) {
3347 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003348 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3349 if (bytes == NULL)
3350 return NULL;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003351 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3352 if (_PyUnicode_UTF8(u) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003353 Py_DECREF(bytes);
3354 return NULL;
3355 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003356 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
3357 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003358 Py_DECREF(bytes);
3359 }
3360
3361 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003362 *psize = PyUnicode_UTF8_LENGTH(unicode);
3363 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003364}
3365
3366char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003367PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003368{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003369 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3370}
3371
3372#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02003373static int unicode_as_unicode_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003374#endif
3375
3376
3377Py_UNICODE *
3378PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3379{
3380 PyUnicodeObject *u;
3381 const unsigned char *one_byte;
3382#if SIZEOF_WCHAR_T == 4
3383 const Py_UCS2 *two_bytes;
3384#else
3385 const Py_UCS4 *four_bytes;
3386 const Py_UCS4 *ucs4_end;
3387 Py_ssize_t num_surrogates;
3388#endif
3389 wchar_t *w;
3390 wchar_t *wchar_end;
3391
3392 if (!PyUnicode_Check(unicode)) {
3393 PyErr_BadArgument();
3394 return NULL;
3395 }
3396 u = (PyUnicodeObject*)unicode;
3397 if (_PyUnicode_WSTR(u) == NULL) {
3398 /* Non-ASCII compact unicode object */
3399 assert(_PyUnicode_KIND(u) != 0);
3400 assert(PyUnicode_IS_READY(u));
3401
3402#ifdef Py_DEBUG
3403 ++unicode_as_unicode_calls;
3404#endif
3405
3406 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
3407#if SIZEOF_WCHAR_T == 2
3408 four_bytes = PyUnicode_4BYTE_DATA(u);
3409 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
3410 num_surrogates = 0;
3411
3412 for (; four_bytes < ucs4_end; ++four_bytes) {
3413 if (*four_bytes > 0xFFFF)
3414 ++num_surrogates;
3415 }
3416
3417 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
3418 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
3419 if (!_PyUnicode_WSTR(u)) {
3420 PyErr_NoMemory();
3421 return NULL;
3422 }
3423 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
3424
3425 w = _PyUnicode_WSTR(u);
3426 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
3427 four_bytes = PyUnicode_4BYTE_DATA(u);
3428 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3429 if (*four_bytes > 0xFFFF) {
3430 /* encode surrogate pair in this case */
3431 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
3432 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
3433 }
3434 else
3435 *w = *four_bytes;
3436
3437 if (w > wchar_end) {
3438 assert(0 && "Miscalculated string end");
3439 }
3440 }
3441 *w = 0;
3442#else
3443 /* sizeof(wchar_t) == 4 */
3444 Py_FatalError("Impossible unicode object state, wstr and str "
3445 "should share memory already.");
3446 return NULL;
3447#endif
3448 }
3449 else {
3450 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3451 (_PyUnicode_LENGTH(u) + 1));
3452 if (!_PyUnicode_WSTR(u)) {
3453 PyErr_NoMemory();
3454 return NULL;
3455 }
3456 if (!PyUnicode_IS_COMPACT_ASCII(u))
3457 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
3458 w = _PyUnicode_WSTR(u);
3459 wchar_end = w + _PyUnicode_LENGTH(u);
3460
3461 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
3462 one_byte = PyUnicode_1BYTE_DATA(u);
3463 for (; w < wchar_end; ++one_byte, ++w)
3464 *w = *one_byte;
3465 /* null-terminate the wstr */
3466 *w = 0;
3467 }
3468 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
3469#if SIZEOF_WCHAR_T == 4
3470 two_bytes = PyUnicode_2BYTE_DATA(u);
3471 for (; w < wchar_end; ++two_bytes, ++w)
3472 *w = *two_bytes;
3473 /* null-terminate the wstr */
3474 *w = 0;
3475#else
3476 /* sizeof(wchar_t) == 2 */
3477 PyObject_FREE(_PyUnicode_WSTR(u));
3478 _PyUnicode_WSTR(u) = NULL;
3479 Py_FatalError("Impossible unicode object state, wstr "
3480 "and str should share memory already.");
3481 return NULL;
3482#endif
3483 }
3484 else {
3485 assert(0 && "This should never happen.");
3486 }
3487 }
3488 }
3489 if (size != NULL)
3490 *size = PyUnicode_WSTR_LENGTH(u);
3491 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003492}
3493
Alexander Belopolsky40018472011-02-26 01:02:56 +00003494Py_UNICODE *
3495PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003496{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003497 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003498}
3499
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003500
Alexander Belopolsky40018472011-02-26 01:02:56 +00003501Py_ssize_t
3502PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003503{
3504 if (!PyUnicode_Check(unicode)) {
3505 PyErr_BadArgument();
3506 goto onError;
3507 }
3508 return PyUnicode_GET_SIZE(unicode);
3509
Benjamin Peterson29060642009-01-31 22:14:21 +00003510 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003511 return -1;
3512}
3513
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003514Py_ssize_t
3515PyUnicode_GetLength(PyObject *unicode)
3516{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003517 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003518 PyErr_BadArgument();
3519 return -1;
3520 }
3521
3522 return PyUnicode_GET_LENGTH(unicode);
3523}
3524
3525Py_UCS4
3526PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3527{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003528 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3529 PyErr_BadArgument();
3530 return (Py_UCS4)-1;
3531 }
3532 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3533 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003534 return (Py_UCS4)-1;
3535 }
3536 return PyUnicode_READ_CHAR(unicode, index);
3537}
3538
3539int
3540PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3541{
3542 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003543 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003544 return -1;
3545 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003546 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3547 PyErr_SetString(PyExc_IndexError, "string index out of range");
3548 return -1;
3549 }
3550 if (_PyUnicode_Dirty(unicode))
3551 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003552 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3553 index, ch);
3554 return 0;
3555}
3556
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage duration. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3562
Victor Stinner554f3f02010-06-16 23:33:54 +00003563/* create or adjust a UnicodeDecodeError */
3564static void
3565make_decode_exception(PyObject **exceptionObject,
3566 const char *encoding,
3567 const char *input, Py_ssize_t length,
3568 Py_ssize_t startpos, Py_ssize_t endpos,
3569 const char *reason)
3570{
3571 if (*exceptionObject == NULL) {
3572 *exceptionObject = PyUnicodeDecodeError_Create(
3573 encoding, input, length, startpos, endpos, reason);
3574 }
3575 else {
3576 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3577 goto onError;
3578 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3579 goto onError;
3580 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3581 goto onError;
3582 }
3583 return;
3584
3585onError:
3586 Py_DECREF(*exceptionObject);
3587 *exceptionObject = NULL;
3588}
3589
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003590/* error handling callback helper:
3591 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003592 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003593 and adjust various state variables.
3594 return 0 on success, -1 on error
3595*/
3596
Alexander Belopolsky40018472011-02-26 01:02:56 +00003597static int
3598unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003599 const char *encoding, const char *reason,
3600 const char **input, const char **inend, Py_ssize_t *startinpos,
3601 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3602 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003603{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003604 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003605
3606 PyObject *restuple = NULL;
3607 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003608 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003609 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003610 Py_ssize_t requiredsize;
3611 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003612 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003613 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003614 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003615 int res = -1;
3616
3617 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003618 *errorHandler = PyCodec_LookupError(errors);
3619 if (*errorHandler == NULL)
3620 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003621 }
3622
Victor Stinner554f3f02010-06-16 23:33:54 +00003623 make_decode_exception(exceptionObject,
3624 encoding,
3625 *input, *inend - *input,
3626 *startinpos, *endinpos,
3627 reason);
3628 if (*exceptionObject == NULL)
3629 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003630
3631 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3632 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003633 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003634 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003635 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003636 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003637 }
3638 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003639 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003640
3641 /* Copy back the bytes variables, which might have been modified by the
3642 callback */
3643 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3644 if (!inputobj)
3645 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003646 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003647 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003648 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003649 *input = PyBytes_AS_STRING(inputobj);
3650 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003651 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003652 /* we can DECREF safely, as the exception has another reference,
3653 so the object won't go away. */
3654 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003655
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003656 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003657 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003658 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003659 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3660 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003661 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003662
3663 /* need more space? (at least enough for what we
3664 have+the replacement+the rest of the string (starting
3665 at the new input position), so we won't have to check space
3666 when there are no errors in the rest of the string) */
3667 repptr = PyUnicode_AS_UNICODE(repunicode);
3668 repsize = PyUnicode_GET_SIZE(repunicode);
3669 requiredsize = *outpos + repsize + insize-newpos;
3670 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003671 if (requiredsize<2*outsize)
3672 requiredsize = 2*outsize;
Victor Stinnerfe226c02011-10-03 03:52:20 +02003673 if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003674 goto onError;
3675 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003676 }
3677 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003678 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003679 Py_UNICODE_COPY(*outptr, repptr, repsize);
3680 *outptr += repsize;
3681 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003682
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003683 /* we made it! */
3684 res = 0;
3685
Benjamin Peterson29060642009-01-31 22:14:21 +00003686 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003687 Py_XDECREF(restuple);
3688 return res;
3689}
3690
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003691/* --- UTF-7 Codec -------------------------------------------------------- */
3692
Antoine Pitrou244651a2009-05-04 18:56:13 +00003693/* See RFC2152 for details. We encode conservatively and decode liberally. */
3694
3695/* Three simple macros defining base-64. */
3696
/* Is c a base-64 character?  Yields 1 for the 64 characters of the
   base-64 alphabet (letters, digits, '+' and '/') and 0 otherwise.
   Note that c is evaluated several times. */

#define IS_BASE64(c)                        \
    ((c) == '+' || (c) == '/' ||            \
     ((c) >= '0' && (c) <= '9') ||          \
     ((c) >= 'a' && (c) <= 'z') ||          \
     ((c) >= 'A' && (c) <= 'Z'))
3704
/* Given that c is a base-64 character, what is its base-64 value?
   'A'-'Z' map to 0-25, 'a'-'z' to 26-51, '0'-'9' to 52-61 and '+' to 62;
   anything else (i.e. '/') yields 63.  c is evaluated several times. */

#define FROM_BASE64(c)                                  \
    ((c) == '+' ? 62 :                                  \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     63)
3712
/* What is the base-64 character of the bottom 6 bits of n?
   Only the low six bits of n select the character; higher bits are
   ignored. */

#define TO_BASE64(n)                        \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"           \
     "abcdefghijklmnopqrstuvwxyz"           \
     "0123456789"                           \
     "+/"[(n) & 0x3f])
3717
/* DECODE_DIRECT: true when a byte met outside a shift sequence in a
 * UTF-7 string decodes as itself.  Decoding is permissive: every byte
 * in the ASCII range qualifies except '+', which opens a base64
 * section.  c is evaluated twice. */

#define DECODE_DIRECT(c)                    \
    ((c) != '+' && (c) <= 127)
3725
3726/* The UTF-7 encoder treats ASCII characters differently according to
3727 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
3728 * the above). See RFC2152. This array identifies these different
3729 * sets:
3730 * 0 : "Set D"
3731 * alphanumeric and '(),-./:?
3732 * 1 : "Set O"
3733 * !"#$%&*;<=>@[]^_`{|}
3734 * 2 : "whitespace"
3735 * ht nl cr sp
3736 * 3 : special (must be base64 encoded)
3737 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
3738 */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003739
/* Category (0 = Set D, 1 = Set O, 2 = whitespace, 3 = special) of each
   ASCII code point, indexed by the code point itself.  The table is
   only ever read, so it is declared const. */
static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
3759
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 *   c        - character to classify; only 1..127 can ever be direct
 *   directO  - nonzero if "Set O" characters are emitted as themselves
 *   directWS - nonzero if whitespace is emitted as itself
 *
 * The range test comes first so that utf7_category is never indexed
 * out of bounds.  Both flag arguments are parenthesized in the
 * expansion so that compound expressions (e.g. `a || b`) bind as a
 * single operand.  c is evaluated several times. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003771
Alexander Belopolsky40018472011-02-26 01:02:56 +00003772PyObject *
3773PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003774 Py_ssize_t size,
3775 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003776{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003777 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3778}
3779
Antoine Pitrou244651a2009-05-04 18:56:13 +00003780/* The decoder. The only state we preserve is our read position,
3781 * i.e. how many characters we have consumed. So if we end in the
3782 * middle of a shift sequence we have to back off the read position
3783 * and the output to the beginning of the sequence, otherwise we lose
3784 * all the shift state (seen bits, number of bits seen, high
3785 * surrogate). */
3786
Alexander Belopolsky40018472011-02-26 01:02:56 +00003787PyObject *
3788PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003789 Py_ssize_t size,
3790 const char *errors,
3791 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003792{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003793 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003794 Py_ssize_t startinpos;
3795 Py_ssize_t endinpos;
3796 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003797 const char *e;
3798 PyUnicodeObject *unicode;
3799 Py_UNICODE *p;
3800 const char *errmsg = "";
3801 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003802 Py_UNICODE *shiftOutStart;
3803 unsigned int base64bits = 0;
3804 unsigned long base64buffer = 0;
3805 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003806 PyObject *errorHandler = NULL;
3807 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003808
3809 unicode = _PyUnicode_New(size);
3810 if (!unicode)
3811 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003812 if (size == 0) {
3813 if (consumed)
3814 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003815 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003816 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003817
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003818 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003819 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003820 e = s + size;
3821
3822 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003823 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003824 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003825 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003826
Antoine Pitrou244651a2009-05-04 18:56:13 +00003827 if (inShift) { /* in a base-64 section */
3828 if (IS_BASE64(ch)) { /* consume a base-64 character */
3829 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3830 base64bits += 6;
3831 s++;
3832 if (base64bits >= 16) {
3833 /* we have enough bits for a UTF-16 value */
3834 Py_UNICODE outCh = (Py_UNICODE)
3835 (base64buffer >> (base64bits-16));
3836 base64bits -= 16;
3837 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3838 if (surrogate) {
3839 /* expecting a second surrogate */
3840 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3841#ifdef Py_UNICODE_WIDE
3842 *p++ = (((surrogate & 0x3FF)<<10)
3843 | (outCh & 0x3FF)) + 0x10000;
3844#else
3845 *p++ = surrogate;
3846 *p++ = outCh;
3847#endif
3848 surrogate = 0;
3849 }
3850 else {
3851 surrogate = 0;
3852 errmsg = "second surrogate missing";
3853 goto utf7Error;
3854 }
3855 }
3856 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3857 /* first surrogate */
3858 surrogate = outCh;
3859 }
3860 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3861 errmsg = "unexpected second surrogate";
3862 goto utf7Error;
3863 }
3864 else {
3865 *p++ = outCh;
3866 }
3867 }
3868 }
3869 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003870 inShift = 0;
3871 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003872 if (surrogate) {
3873 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003874 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003875 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003876 if (base64bits > 0) { /* left-over bits */
3877 if (base64bits >= 6) {
3878 /* We've seen at least one base-64 character */
3879 errmsg = "partial character in shift sequence";
3880 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003881 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003882 else {
3883 /* Some bits remain; they should be zero */
3884 if (base64buffer != 0) {
3885 errmsg = "non-zero padding bits in shift sequence";
3886 goto utf7Error;
3887 }
3888 }
3889 }
3890 if (ch != '-') {
3891 /* '-' is absorbed; other terminating
3892 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003893 *p++ = ch;
3894 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003895 }
3896 }
3897 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003898 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003899 s++; /* consume '+' */
3900 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003901 s++;
3902 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003903 }
3904 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003905 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003906 shiftOutStart = p;
3907 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003908 }
3909 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003910 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003911 *p++ = ch;
3912 s++;
3913 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003914 else {
3915 startinpos = s-starts;
3916 s++;
3917 errmsg = "unexpected special character";
3918 goto utf7Error;
3919 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003920 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003921utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003922 outpos = p-PyUnicode_AS_UNICODE(unicode);
3923 endinpos = s-starts;
3924 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003925 errors, &errorHandler,
3926 "utf7", errmsg,
3927 &starts, &e, &startinpos, &endinpos, &exc, &s,
3928 &unicode, &outpos, &p))
3929 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003930 }
3931
Antoine Pitrou244651a2009-05-04 18:56:13 +00003932 /* end of string */
3933
3934 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3935 /* if we're in an inconsistent state, that's an error */
3936 if (surrogate ||
3937 (base64bits >= 6) ||
3938 (base64bits > 0 && base64buffer != 0)) {
3939 outpos = p-PyUnicode_AS_UNICODE(unicode);
3940 endinpos = size;
3941 if (unicode_decode_call_errorhandler(
3942 errors, &errorHandler,
3943 "utf7", "unterminated shift sequence",
3944 &starts, &e, &startinpos, &endinpos, &exc, &s,
3945 &unicode, &outpos, &p))
3946 goto onError;
3947 if (s < e)
3948 goto restart;
3949 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003950 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003951
3952 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003953 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003954 if (inShift) {
3955 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003956 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003957 }
3958 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003959 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003960 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003961 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003962
Victor Stinnerfe226c02011-10-03 03:52:20 +02003963 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003964 goto onError;
3965
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003966 Py_XDECREF(errorHandler);
3967 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02003968#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02003969 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003970 Py_DECREF(unicode);
3971 return NULL;
3972 }
Victor Stinner17efeed2011-10-04 20:05:46 +02003973#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02003974 assert(_PyUnicode_CheckConsistency(unicode, 1));
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003975 return (PyObject *)unicode;
3976
Benjamin Peterson29060642009-01-31 22:14:21 +00003977 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003978 Py_XDECREF(errorHandler);
3979 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003980 Py_DECREF(unicode);
3981 return NULL;
3982}
3983
3984
Alexander Belopolsky40018472011-02-26 01:02:56 +00003985PyObject *
3986PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003987 Py_ssize_t size,
3988 int base64SetO,
3989 int base64WhiteSpace,
3990 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003991{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003992 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003993 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003994 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003995 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003996 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003997 unsigned int base64bits = 0;
3998 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003999 char * out;
4000 char * start;
4001
4002 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004003 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004004
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00004005 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004006 return PyErr_NoMemory();
4007
Antoine Pitrou244651a2009-05-04 18:56:13 +00004008 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004009 if (v == NULL)
4010 return NULL;
4011
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004012 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004013 for (;i < size; ++i) {
4014 Py_UNICODE ch = s[i];
4015
Antoine Pitrou244651a2009-05-04 18:56:13 +00004016 if (inShift) {
4017 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4018 /* shifting out */
4019 if (base64bits) { /* output remaining bits */
4020 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4021 base64buffer = 0;
4022 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004023 }
4024 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004025 /* Characters not in the BASE64 set implicitly unshift the sequence
4026 so no '-' is required, except if the character is itself a '-' */
4027 if (IS_BASE64(ch) || ch == '-') {
4028 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004029 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004030 *out++ = (char) ch;
4031 }
4032 else {
4033 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004034 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004035 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004036 else { /* not in a shift sequence */
4037 if (ch == '+') {
4038 *out++ = '+';
4039 *out++ = '-';
4040 }
4041 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4042 *out++ = (char) ch;
4043 }
4044 else {
4045 *out++ = '+';
4046 inShift = 1;
4047 goto encode_char;
4048 }
4049 }
4050 continue;
4051encode_char:
4052#ifdef Py_UNICODE_WIDE
4053 if (ch >= 0x10000) {
4054 /* code first surrogate */
4055 base64bits += 16;
4056 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4057 while (base64bits >= 6) {
4058 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4059 base64bits -= 6;
4060 }
4061 /* prepare second surrogate */
4062 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
4063 }
4064#endif
4065 base64bits += 16;
4066 base64buffer = (base64buffer << 16) | ch;
4067 while (base64bits >= 6) {
4068 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4069 base64bits -= 6;
4070 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004071 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004072 if (base64bits)
4073 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4074 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004075 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004076 if (_PyBytes_Resize(&v, out - start) < 0)
4077 return NULL;
4078 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004079}
4080
/* The UTF-7 helper macros are not needed past this point. */
#undef ENCODE_DIRECT
#undef DECODE_DIRECT
#undef TO_BASE64
#undef FROM_BASE64
#undef IS_BASE64
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004086
Guido van Rossumd57fd912000-03-10 22:53:23 +00004087/* --- UTF-8 Codec -------------------------------------------------------- */
4088
/* Indexed by the first byte of a UTF-8 sequence.  The table is only
   ever read, so it is declared const. */
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
4110
Alexander Belopolsky40018472011-02-26 01:02:56 +00004111PyObject *
4112PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004113 Py_ssize_t size,
4114 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004115{
Walter Dörwald69652032004-09-07 20:24:22 +00004116 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4117}
4118
Antoine Pitrouab868312009-01-10 15:40:25 +00004119/* Mask to check or force alignment of a pointer to C 'long' boundaries */
4120#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
4121
4122/* Mask to quickly check whether a C 'long' contains a
4123 non-ASCII, UTF8-encoded char. */
4124#if (SIZEOF_LONG == 8)
4125# define ASCII_CHAR_MASK 0x8080808080808080L
4126#elif (SIZEOF_LONG == 4)
4127# define ASCII_CHAR_MASK 0x80808080L
4128#else
4129# error C 'long' size should be either 4 or 8!
4130#endif
4131
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004132/* Scans a UTF-8 string and returns the maximum character to be expected,
4133 the size of the decoded unicode string and if any major errors were
4134 encountered.
4135
4136 This function does check basic UTF-8 sanity, it does however NOT CHECK
4137 if the string contains surrogates, and if all continuation bytes are
4138 within the correct ranges, these checks are performed in
4139 PyUnicode_DecodeUTF8Stateful.
4140
4141 If it sets has_errors to 1, it means the value of unicode_size and max_char
4142 will be bogus and you should not rely on useful information in them.
4143 */
4144static Py_UCS4
4145utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
4146 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
4147 int *has_errors)
4148{
4149 Py_ssize_t n;
4150 Py_ssize_t char_count = 0;
4151 Py_UCS4 max_char = 127, new_max;
4152 Py_UCS4 upper_bound;
4153 const unsigned char *p = (const unsigned char *)s;
4154 const unsigned char *end = p + string_size;
4155 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
4156 int err = 0;
4157
4158 for (; p < end && !err; ++p, ++char_count) {
4159 /* Only check value if it's not a ASCII char... */
4160 if (*p < 0x80) {
4161 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4162 an explanation. */
4163 if (!((size_t) p & LONG_PTR_MASK)) {
4164 /* Help register allocation */
4165 register const unsigned char *_p = p;
4166 while (_p < aligned_end) {
4167 unsigned long value = *(unsigned long *) _p;
4168 if (value & ASCII_CHAR_MASK)
4169 break;
4170 _p += SIZEOF_LONG;
4171 char_count += SIZEOF_LONG;
4172 }
4173 p = _p;
4174 if (p == end)
4175 break;
4176 }
4177 }
4178 if (*p >= 0x80) {
4179 n = utf8_code_length[*p];
4180 new_max = max_char;
4181 switch (n) {
4182 /* invalid start byte */
4183 case 0:
4184 err = 1;
4185 break;
4186 case 2:
4187 /* Code points between 0x00FF and 0x07FF inclusive.
4188 Approximate the upper bound of the code point,
4189 if this flips over 255 we can be sure it will be more
4190 than 255 and the string will need 2 bytes per code coint,
4191 if it stays under or equal to 255, we can be sure 1 byte
4192 is enough.
4193 ((*p & 0b00011111) << 6) | 0b00111111 */
4194 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
4195 if (max_char < upper_bound)
4196 new_max = upper_bound;
4197 /* Ensure we track at least that we left ASCII space. */
4198 if (new_max < 128)
4199 new_max = 128;
4200 break;
4201 case 3:
4202 /* Between 0x0FFF and 0xFFFF inclusive, so values are
4203 always > 255 and <= 65535 and will always need 2 bytes. */
4204 if (max_char < 65535)
4205 new_max = 65535;
4206 break;
4207 case 4:
4208 /* Code point will be above 0xFFFF for sure in this case. */
4209 new_max = 65537;
4210 break;
4211 /* Internal error, this should be caught by the first if */
4212 case 1:
4213 default:
4214 assert(0 && "Impossible case in utf8_max_char_and_size");
4215 err = 1;
4216 }
4217 /* Instead of number of overall bytes for this code point,
Georg Brandl7597add2011-10-05 16:36:47 +02004218 n contains the number of following bytes: */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004219 --n;
4220 /* Check if the follow up chars are all valid continuation bytes */
4221 if (n >= 1) {
4222 const unsigned char *cont;
4223 if ((p + n) >= end) {
4224 if (consumed == 0)
4225 /* incomplete data, non-incremental decoding */
4226 err = 1;
4227 break;
4228 }
4229 for (cont = p + 1; cont < (p + n); ++cont) {
4230 if ((*cont & 0xc0) != 0x80) {
4231 err = 1;
4232 break;
4233 }
4234 }
4235 p += n;
4236 }
4237 else
4238 err = 1;
4239 max_char = new_max;
4240 }
4241 }
4242
4243 if (unicode_size)
4244 *unicode_size = char_count;
4245 if (has_errors)
4246 *has_errors = err;
4247 return max_char;
4248}
4249
/* Store `value` at position `index` of the character buffer `buf`,
   converting it to the element type selected by `kind`:
   Py_UNICODE for PyUnicode_WCHAR_KIND (the wstr field of the legacy
   unicode representation), unsigned char for PyUnicode_1BYTE_KIND,
   Py_UCS2 for PyUnicode_2BYTE_KIND and Py_UCS4 for anything else.
   Similar to PyUnicode_WRITE, but it can also write into wstr.
   `kind` is evaluated once; `buf`, `index` and `value` are evaluated
   once, in the branch that is taken. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value)                     \
    do {                                                                    \
        const int write_kind_ = (kind);                                     \
        if (write_kind_ == PyUnicode_WCHAR_KIND) {                          \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);           \
        }                                                                   \
        else if (write_kind_ == PyUnicode_1BYTE_KIND) {                     \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value);     \
        }                                                                   \
        else if (write_kind_ == PyUnicode_2BYTE_KIND) {                     \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value);                 \
        }                                                                   \
        else {                                                              \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value);                 \
        }                                                                   \
    } while (0)
4264
Alexander Belopolsky40018472011-02-26 01:02:56 +00004265PyObject *
4266PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004267 Py_ssize_t size,
4268 const char *errors,
4269 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004270{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004271 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004272 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004273 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004274 Py_ssize_t startinpos;
4275 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00004276 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004277 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004278 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004279 PyObject *errorHandler = NULL;
4280 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004281 Py_UCS4 maxchar = 0;
4282 Py_ssize_t unicode_size;
4283 Py_ssize_t i;
4284 int kind;
4285 void *data;
4286 int has_errors;
4287 Py_UNICODE *error_outptr;
4288#if SIZEOF_WCHAR_T == 2
4289 Py_ssize_t wchar_offset = 0;
4290#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004291
Walter Dörwald69652032004-09-07 20:24:22 +00004292 if (size == 0) {
4293 if (consumed)
4294 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004295 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00004296 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004297 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
4298 consumed, &has_errors);
4299 if (has_errors) {
4300 unicode = _PyUnicode_New(size);
4301 if (!unicode)
4302 return NULL;
4303 kind = PyUnicode_WCHAR_KIND;
4304 data = PyUnicode_AS_UNICODE(unicode);
4305 assert(data != NULL);
4306 }
4307 else {
4308 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
4309 if (!unicode)
4310 return NULL;
4311 /* When the string is ASCII only, just use memcpy and return.
4312 unicode_size may be != size if there is an incomplete UTF-8
4313 sequence at the end of the ASCII block. */
4314 if (maxchar < 128 && size == unicode_size) {
4315 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
4316 return (PyObject *)unicode;
4317 }
4318 kind = PyUnicode_KIND(unicode);
4319 data = PyUnicode_DATA(unicode);
4320 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004321 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004322 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004323 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00004324 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004325
4326 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004327 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004328
4329 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004330 /* Fast path for runs of ASCII characters. Given that common UTF-8
4331 input will consist of an overwhelming majority of ASCII
4332 characters, we try to optimize for this case by checking
4333 as many characters as a C 'long' can contain.
4334 First, check if we can do an aligned read, as most CPUs have
4335 a penalty for unaligned reads.
4336 */
4337 if (!((size_t) s & LONG_PTR_MASK)) {
4338 /* Help register allocation */
4339 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004340 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004341 while (_s < aligned_end) {
4342 /* Read a whole long at a time (either 4 or 8 bytes),
4343 and do a fast unrolled copy if it only contains ASCII
4344 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004345 unsigned long value = *(unsigned long *) _s;
4346 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004347 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004348 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
4349 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
4350 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
4351 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004352#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004353 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
4354 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
4355 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
4356 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004357#endif
4358 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004359 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004360 }
4361 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004362 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004363 if (s == e)
4364 break;
4365 ch = (unsigned char)*s;
4366 }
4367 }
4368
4369 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004370 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004371 s++;
4372 continue;
4373 }
4374
4375 n = utf8_code_length[ch];
4376
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004377 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004378 if (consumed)
4379 break;
4380 else {
4381 errmsg = "unexpected end of data";
4382 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004383 endinpos = startinpos+1;
4384 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4385 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004386 goto utf8Error;
4387 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004388 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004389
4390 switch (n) {
4391
4392 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004393 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004394 startinpos = s-starts;
4395 endinpos = startinpos+1;
4396 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004397
4398 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004399 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004400 startinpos = s-starts;
4401 endinpos = startinpos+1;
4402 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004403
4404 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004405 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004406 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004407 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004408 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004409 goto utf8Error;
4410 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004411 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004412 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004413 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004414 break;
4415
4416 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004417 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4418 will result in surrogates in range d800-dfff. Surrogates are
4419 not valid UTF-8 so they are rejected.
4420 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4421 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004422 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004423 (s[2] & 0xc0) != 0x80 ||
4424 ((unsigned char)s[0] == 0xE0 &&
4425 (unsigned char)s[1] < 0xA0) ||
4426 ((unsigned char)s[0] == 0xED &&
4427 (unsigned char)s[1] > 0x9F)) {
4428 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004429 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004430 endinpos = startinpos + 1;
4431
4432 /* if s[1] first two bits are 1 and 0, then the invalid
4433 continuation byte is s[2], so increment endinpos by 1,
4434 if not, s[1] is invalid and endinpos doesn't need to
4435 be incremented. */
4436 if ((s[1] & 0xC0) == 0x80)
4437 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004438 goto utf8Error;
4439 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004440 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004441 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004442 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004443 break;
4444
4445 case 4:
4446 if ((s[1] & 0xc0) != 0x80 ||
4447 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004448 (s[3] & 0xc0) != 0x80 ||
4449 ((unsigned char)s[0] == 0xF0 &&
4450 (unsigned char)s[1] < 0x90) ||
4451 ((unsigned char)s[0] == 0xF4 &&
4452 (unsigned char)s[1] > 0x8F)) {
4453 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004454 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004455 endinpos = startinpos + 1;
4456 if ((s[1] & 0xC0) == 0x80) {
4457 endinpos++;
4458 if ((s[2] & 0xC0) == 0x80)
4459 endinpos++;
4460 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004461 goto utf8Error;
4462 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004463 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004464 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4465 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4466
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004467 /* If the string is flexible or we have native UCS-4, write
4468 directly.. */
4469 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
4470 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00004471
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004472 else {
4473 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00004474
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004475 /* translate from 10000..10FFFF to 0..FFFF */
4476 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00004477
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004478 /* high surrogate = top 10 bits added to D800 */
4479 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4480 (Py_UNICODE)(0xD800 + (ch >> 10)));
4481
4482 /* low surrogate = bottom 10 bits added to DC00 */
4483 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4484 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
4485 }
4486#if SIZEOF_WCHAR_T == 2
4487 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004488#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004489 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004490 }
4491 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004492 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004493
Benjamin Peterson29060642009-01-31 22:14:21 +00004494 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004495 /* If this is not yet a resizable string, make it one.. */
4496 if (kind != PyUnicode_WCHAR_KIND) {
4497 const Py_UNICODE *u;
4498 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
4499 if (!new_unicode)
4500 goto onError;
4501 u = PyUnicode_AsUnicode((PyObject *)unicode);
4502 if (!u)
4503 goto onError;
4504#if SIZEOF_WCHAR_T == 2
4505 i += wchar_offset;
4506#endif
4507 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
4508 Py_DECREF(unicode);
4509 unicode = new_unicode;
4510 kind = 0;
4511 data = PyUnicode_AS_UNICODE(new_unicode);
4512 assert(data != NULL);
4513 }
4514 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00004515 if (unicode_decode_call_errorhandler(
4516 errors, &errorHandler,
4517 "utf8", errmsg,
4518 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004519 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00004520 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004521 /* Update data because unicode_decode_call_errorhandler might have
4522 re-created or resized the unicode object. */
4523 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004524 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004525 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004526 /* Ensure the unicode_size calculation above was correct: */
4527 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
4528
Walter Dörwald69652032004-09-07 20:24:22 +00004529 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004530 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004531
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004532 /* Adjust length and ready string when it contained errors and
4533 is of the old resizable kind. */
4534 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004535 if (PyUnicode_Resize((PyObject**)&unicode, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004536 goto onError;
4537 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004538
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004539 Py_XDECREF(errorHandler);
4540 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02004541#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004542 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004543 Py_DECREF(unicode);
4544 return NULL;
4545 }
Victor Stinner17efeed2011-10-04 20:05:46 +02004546#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004547 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004548 return (PyObject *)unicode;
4549
Benjamin Peterson29060642009-01-31 22:14:21 +00004550 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004551 Py_XDECREF(errorHandler);
4552 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004553 Py_DECREF(unicode);
4554 return NULL;
4555}
4556
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004557#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00004558
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004559#ifdef __APPLE__
4560
4561/* Simplified UTF-8 decoder using surrogateescape error handler,
4562 used to decode the command line arguments on Mac OS X. */
4563
4564wchar_t*
4565_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4566{
4567 int n;
4568 const char *e;
4569 wchar_t *unicode, *p;
4570
4571 /* Note: size will always be longer than the resulting Unicode
4572 character count */
4573 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4574 PyErr_NoMemory();
4575 return NULL;
4576 }
4577 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4578 if (!unicode)
4579 return NULL;
4580
4581 /* Unpack UTF-8 encoded data */
4582 p = unicode;
4583 e = s + size;
4584 while (s < e) {
4585 Py_UCS4 ch = (unsigned char)*s;
4586
4587 if (ch < 0x80) {
4588 *p++ = (wchar_t)ch;
4589 s++;
4590 continue;
4591 }
4592
4593 n = utf8_code_length[ch];
4594 if (s + n > e) {
4595 goto surrogateescape;
4596 }
4597
4598 switch (n) {
4599 case 0:
4600 case 1:
4601 goto surrogateescape;
4602
4603 case 2:
4604 if ((s[1] & 0xc0) != 0x80)
4605 goto surrogateescape;
4606 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4607 assert ((ch > 0x007F) && (ch <= 0x07FF));
4608 *p++ = (wchar_t)ch;
4609 break;
4610
4611 case 3:
4612 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4613 will result in surrogates in range d800-dfff. Surrogates are
4614 not valid UTF-8 so they are rejected.
4615 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4616 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4617 if ((s[1] & 0xc0) != 0x80 ||
4618 (s[2] & 0xc0) != 0x80 ||
4619 ((unsigned char)s[0] == 0xE0 &&
4620 (unsigned char)s[1] < 0xA0) ||
4621 ((unsigned char)s[0] == 0xED &&
4622 (unsigned char)s[1] > 0x9F)) {
4623
4624 goto surrogateescape;
4625 }
4626 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4627 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004628 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004629 break;
4630
4631 case 4:
4632 if ((s[1] & 0xc0) != 0x80 ||
4633 (s[2] & 0xc0) != 0x80 ||
4634 (s[3] & 0xc0) != 0x80 ||
4635 ((unsigned char)s[0] == 0xF0 &&
4636 (unsigned char)s[1] < 0x90) ||
4637 ((unsigned char)s[0] == 0xF4 &&
4638 (unsigned char)s[1] > 0x8F)) {
4639 goto surrogateescape;
4640 }
4641 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4642 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4643 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4644
4645#if SIZEOF_WCHAR_T == 4
4646 *p++ = (wchar_t)ch;
4647#else
4648 /* compute and append the two surrogates: */
4649
4650 /* translate from 10000..10FFFF to 0..FFFF */
4651 ch -= 0x10000;
4652
4653 /* high surrogate = top 10 bits added to D800 */
4654 *p++ = (wchar_t)(0xD800 + (ch >> 10));
4655
4656 /* low surrogate = bottom 10 bits added to DC00 */
4657 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
4658#endif
4659 break;
4660 }
4661 s += n;
4662 continue;
4663
4664 surrogateescape:
4665 *p++ = 0xDC00 + ch;
4666 s++;
4667 }
4668 *p = L'\0';
4669 return unicode;
4670}
4671
4672#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004673
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004674/* Primary internal function which creates utf8 encoded bytes objects.
4675
4676 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004677 and allocate exactly as much space needed at the end. Else allocate the
4678 maximum possible needed (4 result bytes per Unicode character), and return
4679 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004680*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004681PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004682_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004683{
Tim Peters602f7402002-04-27 18:03:26 +00004684#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004685
Guido van Rossum98297ee2007-11-06 21:34:58 +00004686 Py_ssize_t i; /* index into s of next input byte */
4687 PyObject *result; /* result string object */
4688 char *p; /* next free byte in output buffer */
4689 Py_ssize_t nallocated; /* number of result bytes allocated */
4690 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004691 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004692 PyObject *errorHandler = NULL;
4693 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004694 int kind;
4695 void *data;
4696 Py_ssize_t size;
4697 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4698#if SIZEOF_WCHAR_T == 2
4699 Py_ssize_t wchar_offset = 0;
4700#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004701
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004702 if (!PyUnicode_Check(unicode)) {
4703 PyErr_BadArgument();
4704 return NULL;
4705 }
4706
4707 if (PyUnicode_READY(unicode) == -1)
4708 return NULL;
4709
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004710 if (PyUnicode_UTF8(unicode))
4711 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4712 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004713
4714 kind = PyUnicode_KIND(unicode);
4715 data = PyUnicode_DATA(unicode);
4716 size = PyUnicode_GET_LENGTH(unicode);
4717
Tim Peters602f7402002-04-27 18:03:26 +00004718 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004719
Tim Peters602f7402002-04-27 18:03:26 +00004720 if (size <= MAX_SHORT_UNICHARS) {
4721 /* Write into the stack buffer; nallocated can't overflow.
4722 * At the end, we'll allocate exactly as much heap space as it
4723 * turns out we need.
4724 */
4725 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004726 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004727 p = stackbuf;
4728 }
4729 else {
4730 /* Overallocate on the heap, and give the excess back at the end. */
4731 nallocated = size * 4;
4732 if (nallocated / 4 != size) /* overflow! */
4733 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004734 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004735 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004736 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004737 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004738 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004739
Tim Peters602f7402002-04-27 18:03:26 +00004740 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004741 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004742
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004743 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004744 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004745 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004746
Guido van Rossumd57fd912000-03-10 22:53:23 +00004747 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004748 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004749 *p++ = (char)(0xc0 | (ch >> 6));
4750 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004751 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004752 Py_ssize_t newpos;
4753 PyObject *rep;
4754 Py_ssize_t repsize, k, startpos;
4755 startpos = i-1;
4756#if SIZEOF_WCHAR_T == 2
4757 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004758#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004759 rep = unicode_encode_call_errorhandler(
4760 errors, &errorHandler, "utf-8", "surrogates not allowed",
4761 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4762 &exc, startpos, startpos+1, &newpos);
4763 if (!rep)
4764 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004765
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004766 if (PyBytes_Check(rep))
4767 repsize = PyBytes_GET_SIZE(rep);
4768 else
4769 repsize = PyUnicode_GET_SIZE(rep);
4770
4771 if (repsize > 4) {
4772 Py_ssize_t offset;
4773
4774 if (result == NULL)
4775 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004776 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004777 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004778
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004779 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4780 /* integer overflow */
4781 PyErr_NoMemory();
4782 goto error;
4783 }
4784 nallocated += repsize - 4;
4785 if (result != NULL) {
4786 if (_PyBytes_Resize(&result, nallocated) < 0)
4787 goto error;
4788 } else {
4789 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004790 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004791 goto error;
4792 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4793 }
4794 p = PyBytes_AS_STRING(result) + offset;
4795 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004796
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004797 if (PyBytes_Check(rep)) {
4798 char *prep = PyBytes_AS_STRING(rep);
4799 for(k = repsize; k > 0; k--)
4800 *p++ = *prep++;
4801 } else /* rep is unicode */ {
4802 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4803 Py_UNICODE c;
4804
4805 for(k=0; k<repsize; k++) {
4806 c = prep[k];
4807 if (0x80 <= c) {
4808 raise_encode_exception(&exc, "utf-8",
4809 PyUnicode_AS_UNICODE(unicode),
4810 size, i-1, i,
4811 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004812 goto error;
4813 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004814 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004815 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004816 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004817 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004818 } else if (ch < 0x10000) {
4819 *p++ = (char)(0xe0 | (ch >> 12));
4820 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4821 *p++ = (char)(0x80 | (ch & 0x3f));
4822 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004823 /* Encode UCS4 Unicode ordinals */
4824 *p++ = (char)(0xf0 | (ch >> 18));
4825 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4826 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4827 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004828#if SIZEOF_WCHAR_T == 2
4829 wchar_offset++;
4830#endif
Tim Peters602f7402002-04-27 18:03:26 +00004831 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004832 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004833
Guido van Rossum98297ee2007-11-06 21:34:58 +00004834 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004835 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004836 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004837 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004838 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004839 }
4840 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004841 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004842 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004843 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004844 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004845 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004846
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004847 Py_XDECREF(errorHandler);
4848 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004849 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004850 error:
4851 Py_XDECREF(errorHandler);
4852 Py_XDECREF(exc);
4853 Py_XDECREF(result);
4854 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004855
Tim Peters602f7402002-04-27 18:03:26 +00004856#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004857}
4858
Alexander Belopolsky40018472011-02-26 01:02:56 +00004859PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004860PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4861 Py_ssize_t size,
4862 const char *errors)
4863{
4864 PyObject *v, *unicode;
4865
4866 unicode = PyUnicode_FromUnicode(s, size);
4867 if (unicode == NULL)
4868 return NULL;
4869 v = _PyUnicode_AsUTF8String(unicode, errors);
4870 Py_DECREF(unicode);
4871 return v;
4872}
4873
4874PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004875PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004876{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004877 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004878}
4879
Walter Dörwald41980ca2007-08-16 21:55:45 +00004880/* --- UTF-32 Codec ------------------------------------------------------- */
4881
4882PyObject *
4883PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004884 Py_ssize_t size,
4885 const char *errors,
4886 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004887{
4888 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4889}
4890
4891PyObject *
4892PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004893 Py_ssize_t size,
4894 const char *errors,
4895 int *byteorder,
4896 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004897{
4898 const char *starts = s;
4899 Py_ssize_t startinpos;
4900 Py_ssize_t endinpos;
4901 Py_ssize_t outpos;
4902 PyUnicodeObject *unicode;
4903 Py_UNICODE *p;
4904#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004905 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004906 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004907#else
4908 const int pairs = 0;
4909#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004910 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004911 int bo = 0; /* assume native ordering by default */
4912 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004913 /* Offsets from q for retrieving bytes in the right order. */
4914#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4915 int iorder[] = {0, 1, 2, 3};
4916#else
4917 int iorder[] = {3, 2, 1, 0};
4918#endif
4919 PyObject *errorHandler = NULL;
4920 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004921
Walter Dörwald41980ca2007-08-16 21:55:45 +00004922 q = (unsigned char *)s;
4923 e = q + size;
4924
4925 if (byteorder)
4926 bo = *byteorder;
4927
4928 /* Check for BOM marks (U+FEFF) in the input and adjust current
4929 byte order setting accordingly. In native mode, the leading BOM
4930 mark is skipped, in all other modes, it is copied to the output
4931 stream as-is (giving a ZWNBSP character). */
4932 if (bo == 0) {
4933 if (size >= 4) {
4934 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004935 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004936#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004937 if (bom == 0x0000FEFF) {
4938 q += 4;
4939 bo = -1;
4940 }
4941 else if (bom == 0xFFFE0000) {
4942 q += 4;
4943 bo = 1;
4944 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004945#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004946 if (bom == 0x0000FEFF) {
4947 q += 4;
4948 bo = 1;
4949 }
4950 else if (bom == 0xFFFE0000) {
4951 q += 4;
4952 bo = -1;
4953 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004954#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004955 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004956 }
4957
4958 if (bo == -1) {
4959 /* force LE */
4960 iorder[0] = 0;
4961 iorder[1] = 1;
4962 iorder[2] = 2;
4963 iorder[3] = 3;
4964 }
4965 else if (bo == 1) {
4966 /* force BE */
4967 iorder[0] = 3;
4968 iorder[1] = 2;
4969 iorder[2] = 1;
4970 iorder[3] = 0;
4971 }
4972
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004973 /* On narrow builds we split characters outside the BMP into two
4974 codepoints => count how much extra space we need. */
4975#ifndef Py_UNICODE_WIDE
4976 for (qq = q; qq < e; qq += 4)
4977 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4978 pairs++;
4979#endif
4980
4981 /* This might be one to much, because of a BOM */
4982 unicode = _PyUnicode_New((size+3)/4+pairs);
4983 if (!unicode)
4984 return NULL;
4985 if (size == 0)
4986 return (PyObject *)unicode;
4987
4988 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004989 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004990
Walter Dörwald41980ca2007-08-16 21:55:45 +00004991 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004992 Py_UCS4 ch;
4993 /* remaining bytes at the end? (size should be divisible by 4) */
4994 if (e-q<4) {
4995 if (consumed)
4996 break;
4997 errmsg = "truncated data";
4998 startinpos = ((const char *)q)-starts;
4999 endinpos = ((const char *)e)-starts;
5000 goto utf32Error;
5001 /* The remaining input chars are ignored if the callback
5002 chooses to skip the input */
5003 }
5004 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5005 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005006
Benjamin Peterson29060642009-01-31 22:14:21 +00005007 if (ch >= 0x110000)
5008 {
5009 errmsg = "codepoint not in range(0x110000)";
5010 startinpos = ((const char *)q)-starts;
5011 endinpos = startinpos+4;
5012 goto utf32Error;
5013 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005014#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005015 if (ch >= 0x10000)
5016 {
5017 *p++ = 0xD800 | ((ch-0x10000) >> 10);
5018 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
5019 }
5020 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00005021#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005022 *p++ = ch;
5023 q += 4;
5024 continue;
5025 utf32Error:
5026 outpos = p-PyUnicode_AS_UNICODE(unicode);
5027 if (unicode_decode_call_errorhandler(
5028 errors, &errorHandler,
5029 "utf32", errmsg,
5030 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
5031 &unicode, &outpos, &p))
5032 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005033 }
5034
5035 if (byteorder)
5036 *byteorder = bo;
5037
5038 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005039 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005040
5041 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02005042 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005043 goto onError;
5044
5045 Py_XDECREF(errorHandler);
5046 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005047#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005048 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005049 Py_DECREF(unicode);
5050 return NULL;
5051 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005052#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005053 assert(_PyUnicode_CheckConsistency(unicode, 1));
Walter Dörwald41980ca2007-08-16 21:55:45 +00005054 return (PyObject *)unicode;
5055
Benjamin Peterson29060642009-01-31 22:14:21 +00005056 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005057 Py_DECREF(unicode);
5058 Py_XDECREF(errorHandler);
5059 Py_XDECREF(exc);
5060 return NULL;
5061}
5062
5063PyObject *
5064PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005065 Py_ssize_t size,
5066 const char *errors,
5067 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005068{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005069 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005070 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005071 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005072#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005073 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005074#else
5075 const int pairs = 0;
5076#endif
5077 /* Offsets from p for storing byte pairs in the right order. */
5078#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5079 int iorder[] = {0, 1, 2, 3};
5080#else
5081 int iorder[] = {3, 2, 1, 0};
5082#endif
5083
Benjamin Peterson29060642009-01-31 22:14:21 +00005084#define STORECHAR(CH) \
5085 do { \
5086 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5087 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5088 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5089 p[iorder[0]] = (CH) & 0xff; \
5090 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005091 } while(0)
5092
5093 /* In narrow builds we can output surrogate pairs as one codepoint,
5094 so we need less space. */
5095#ifndef Py_UNICODE_WIDE
5096 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005097 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
5098 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
5099 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005100#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005101 nsize = (size - pairs + (byteorder == 0));
5102 bytesize = nsize * 4;
5103 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005104 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005105 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005106 if (v == NULL)
5107 return NULL;
5108
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005109 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005110 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005111 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005112 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005113 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005114
5115 if (byteorder == -1) {
5116 /* force LE */
5117 iorder[0] = 0;
5118 iorder[1] = 1;
5119 iorder[2] = 2;
5120 iorder[3] = 3;
5121 }
5122 else if (byteorder == 1) {
5123 /* force BE */
5124 iorder[0] = 3;
5125 iorder[1] = 2;
5126 iorder[2] = 1;
5127 iorder[3] = 0;
5128 }
5129
5130 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005131 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005132#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005133 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
5134 Py_UCS4 ch2 = *s;
5135 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
5136 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
5137 s++;
5138 size--;
5139 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005140 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005141#endif
5142 STORECHAR(ch);
5143 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005144
5145 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005146 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005147#undef STORECHAR
5148}
5149
Alexander Belopolsky40018472011-02-26 01:02:56 +00005150PyObject *
5151PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005152{
5153 if (!PyUnicode_Check(unicode)) {
5154 PyErr_BadArgument();
5155 return NULL;
5156 }
5157 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005158 PyUnicode_GET_SIZE(unicode),
5159 NULL,
5160 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005161}
5162
Guido van Rossumd57fd912000-03-10 22:53:23 +00005163/* --- UTF-16 Codec ------------------------------------------------------- */
5164
Tim Peters772747b2001-08-09 22:21:55 +00005165PyObject *
5166PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005167 Py_ssize_t size,
5168 const char *errors,
5169 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005170{
Walter Dörwald69652032004-09-07 20:24:22 +00005171 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5172}
5173
Antoine Pitrouab868312009-01-10 15:40:25 +00005174/* Two masks for fast checking of whether a C 'long' may contain
5175 UTF16-encoded surrogate characters. This is an efficient heuristic,
5176 assuming that non-surrogate characters with a code point >= 0x8000 are
5177 rare in most input.
5178 FAST_CHAR_MASK is used when the input is in native byte ordering,
5179 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00005180*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005181#if (SIZEOF_LONG == 8)
5182# define FAST_CHAR_MASK 0x8000800080008000L
5183# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
5184#elif (SIZEOF_LONG == 4)
5185# define FAST_CHAR_MASK 0x80008000L
5186# define SWAPPED_FAST_CHAR_MASK 0x00800080L
5187#else
5188# error C 'long' size should be either 4 or 8!
5189#endif
5190
Walter Dörwald69652032004-09-07 20:24:22 +00005191PyObject *
5192PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005193 Py_ssize_t size,
5194 const char *errors,
5195 int *byteorder,
5196 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005197{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005198 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005199 Py_ssize_t startinpos;
5200 Py_ssize_t endinpos;
5201 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005202 PyUnicodeObject *unicode;
5203 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00005204 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005205 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005206 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005207 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005208 /* Offsets from q for retrieving byte pairs in the right order. */
5209#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5210 int ihi = 1, ilo = 0;
5211#else
5212 int ihi = 0, ilo = 1;
5213#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005214 PyObject *errorHandler = NULL;
5215 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005216
5217 /* Note: size will always be longer than the resulting Unicode
5218 character count */
5219 unicode = _PyUnicode_New(size);
5220 if (!unicode)
5221 return NULL;
5222 if (size == 0)
5223 return (PyObject *)unicode;
5224
5225 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005226 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00005227 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005228 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005229
5230 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005231 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005232
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005233 /* Check for BOM marks (U+FEFF) in the input and adjust current
5234 byte order setting accordingly. In native mode, the leading BOM
5235 mark is skipped, in all other modes, it is copied to the output
5236 stream as-is (giving a ZWNBSP character). */
5237 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005238 if (size >= 2) {
5239 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005240#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005241 if (bom == 0xFEFF) {
5242 q += 2;
5243 bo = -1;
5244 }
5245 else if (bom == 0xFFFE) {
5246 q += 2;
5247 bo = 1;
5248 }
Tim Petersced69f82003-09-16 20:30:58 +00005249#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005250 if (bom == 0xFEFF) {
5251 q += 2;
5252 bo = 1;
5253 }
5254 else if (bom == 0xFFFE) {
5255 q += 2;
5256 bo = -1;
5257 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005258#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005259 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005260 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005261
Tim Peters772747b2001-08-09 22:21:55 +00005262 if (bo == -1) {
5263 /* force LE */
5264 ihi = 1;
5265 ilo = 0;
5266 }
5267 else if (bo == 1) {
5268 /* force BE */
5269 ihi = 0;
5270 ilo = 1;
5271 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005272#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5273 native_ordering = ilo < ihi;
5274#else
5275 native_ordering = ilo > ihi;
5276#endif
Tim Peters772747b2001-08-09 22:21:55 +00005277
Antoine Pitrouab868312009-01-10 15:40:25 +00005278 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005279 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005280 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005281 /* First check for possible aligned read of a C 'long'. Unaligned
5282 reads are more expensive, better to defer to another iteration. */
5283 if (!((size_t) q & LONG_PTR_MASK)) {
5284 /* Fast path for runs of non-surrogate chars. */
5285 register const unsigned char *_q = q;
5286 Py_UNICODE *_p = p;
5287 if (native_ordering) {
5288 /* Native ordering is simple: as long as the input cannot
5289 possibly contain a surrogate char, do an unrolled copy
5290 of several 16-bit code points to the target object.
5291 The non-surrogate check is done on several input bytes
5292 at a time (as many as a C 'long' can contain). */
5293 while (_q < aligned_end) {
5294 unsigned long data = * (unsigned long *) _q;
5295 if (data & FAST_CHAR_MASK)
5296 break;
5297 _p[0] = ((unsigned short *) _q)[0];
5298 _p[1] = ((unsigned short *) _q)[1];
5299#if (SIZEOF_LONG == 8)
5300 _p[2] = ((unsigned short *) _q)[2];
5301 _p[3] = ((unsigned short *) _q)[3];
5302#endif
5303 _q += SIZEOF_LONG;
5304 _p += SIZEOF_LONG / 2;
5305 }
5306 }
5307 else {
5308 /* Byteswapped ordering is similar, but we must decompose
5309 the copy bytewise, and take care of zero'ing out the
5310 upper bytes if the target object is in 32-bit units
5311 (that is, in UCS-4 builds). */
5312 while (_q < aligned_end) {
5313 unsigned long data = * (unsigned long *) _q;
5314 if (data & SWAPPED_FAST_CHAR_MASK)
5315 break;
5316 /* Zero upper bytes in UCS-4 builds */
5317#if (Py_UNICODE_SIZE > 2)
5318 _p[0] = 0;
5319 _p[1] = 0;
5320#if (SIZEOF_LONG == 8)
5321 _p[2] = 0;
5322 _p[3] = 0;
5323#endif
5324#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005325 /* Issue #4916; UCS-4 builds on big endian machines must
5326 fill the two last bytes of each 4-byte unit. */
5327#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
5328# define OFF 2
5329#else
5330# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00005331#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005332 ((unsigned char *) _p)[OFF + 1] = _q[0];
5333 ((unsigned char *) _p)[OFF + 0] = _q[1];
5334 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
5335 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
5336#if (SIZEOF_LONG == 8)
5337 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
5338 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
5339 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
5340 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
5341#endif
5342#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00005343 _q += SIZEOF_LONG;
5344 _p += SIZEOF_LONG / 2;
5345 }
5346 }
5347 p = _p;
5348 q = _q;
5349 if (q >= e)
5350 break;
5351 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005352 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005353
Benjamin Peterson14339b62009-01-31 16:36:08 +00005354 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005355
5356 if (ch < 0xD800 || ch > 0xDFFF) {
5357 *p++ = ch;
5358 continue;
5359 }
5360
5361 /* UTF-16 code pair: */
5362 if (q > e) {
5363 errmsg = "unexpected end of data";
5364 startinpos = (((const char *)q) - 2) - starts;
5365 endinpos = ((const char *)e) + 1 - starts;
5366 goto utf16Error;
5367 }
5368 if (0xD800 <= ch && ch <= 0xDBFF) {
5369 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
5370 q += 2;
5371 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00005372#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005373 *p++ = ch;
5374 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005375#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005376 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005377#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005378 continue;
5379 }
5380 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005381 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005382 startinpos = (((const char *)q)-4)-starts;
5383 endinpos = startinpos+2;
5384 goto utf16Error;
5385 }
5386
Benjamin Peterson14339b62009-01-31 16:36:08 +00005387 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005388 errmsg = "illegal encoding";
5389 startinpos = (((const char *)q)-2)-starts;
5390 endinpos = startinpos+2;
5391 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005392
Benjamin Peterson29060642009-01-31 22:14:21 +00005393 utf16Error:
5394 outpos = p - PyUnicode_AS_UNICODE(unicode);
5395 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005396 errors,
5397 &errorHandler,
5398 "utf16", errmsg,
5399 &starts,
5400 (const char **)&e,
5401 &startinpos,
5402 &endinpos,
5403 &exc,
5404 (const char **)&q,
5405 &unicode,
5406 &outpos,
5407 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00005408 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005409 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005410 /* remaining byte at the end? (size should be even) */
5411 if (e == q) {
5412 if (!consumed) {
5413 errmsg = "truncated data";
5414 startinpos = ((const char *)q) - starts;
5415 endinpos = ((const char *)e) + 1 - starts;
5416 outpos = p - PyUnicode_AS_UNICODE(unicode);
5417 if (unicode_decode_call_errorhandler(
5418 errors,
5419 &errorHandler,
5420 "utf16", errmsg,
5421 &starts,
5422 (const char **)&e,
5423 &startinpos,
5424 &endinpos,
5425 &exc,
5426 (const char **)&q,
5427 &unicode,
5428 &outpos,
5429 &p))
5430 goto onError;
5431 /* The remaining input chars are ignored if the callback
5432 chooses to skip the input */
5433 }
5434 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005435
5436 if (byteorder)
5437 *byteorder = bo;
5438
Walter Dörwald69652032004-09-07 20:24:22 +00005439 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005440 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005441
Guido van Rossumd57fd912000-03-10 22:53:23 +00005442 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02005443 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005444 goto onError;
5445
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005446 Py_XDECREF(errorHandler);
5447 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005448#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005449 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005450 Py_DECREF(unicode);
5451 return NULL;
5452 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005453#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005454 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005455 return (PyObject *)unicode;
5456
Benjamin Peterson29060642009-01-31 22:14:21 +00005457 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005458 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005459 Py_XDECREF(errorHandler);
5460 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005461 return NULL;
5462}
5463
Antoine Pitrouab868312009-01-10 15:40:25 +00005464#undef FAST_CHAR_MASK
5465#undef SWAPPED_FAST_CHAR_MASK
5466
Tim Peters772747b2001-08-09 22:21:55 +00005467PyObject *
5468PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005469 Py_ssize_t size,
5470 const char *errors,
5471 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005472{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005473 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005474 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005475 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005476#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005477 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005478#else
5479 const int pairs = 0;
5480#endif
Tim Peters772747b2001-08-09 22:21:55 +00005481 /* Offsets from p for storing byte pairs in the right order. */
5482#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5483 int ihi = 1, ilo = 0;
5484#else
5485 int ihi = 0, ilo = 1;
5486#endif
5487
Benjamin Peterson29060642009-01-31 22:14:21 +00005488#define STORECHAR(CH) \
5489 do { \
5490 p[ihi] = ((CH) >> 8) & 0xff; \
5491 p[ilo] = (CH) & 0xff; \
5492 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005493 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005494
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005495#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005496 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005497 if (s[i] >= 0x10000)
5498 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005499#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005500 /* 2 * (size + pairs + (byteorder == 0)) */
5501 if (size > PY_SSIZE_T_MAX ||
5502 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005503 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005504 nsize = size + pairs + (byteorder == 0);
5505 bytesize = nsize * 2;
5506 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005507 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005508 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005509 if (v == NULL)
5510 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005511
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005512 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005513 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005514 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005515 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005516 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005517
5518 if (byteorder == -1) {
5519 /* force LE */
5520 ihi = 1;
5521 ilo = 0;
5522 }
5523 else if (byteorder == 1) {
5524 /* force BE */
5525 ihi = 0;
5526 ilo = 1;
5527 }
5528
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005529 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005530 Py_UNICODE ch = *s++;
5531 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005532#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005533 if (ch >= 0x10000) {
5534 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5535 ch = 0xD800 | ((ch-0x10000) >> 10);
5536 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005537#endif
Tim Peters772747b2001-08-09 22:21:55 +00005538 STORECHAR(ch);
5539 if (ch2)
5540 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005541 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005542
5543 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005544 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005545#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005546}
5547
Alexander Belopolsky40018472011-02-26 01:02:56 +00005548PyObject *
5549PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005550{
5551 if (!PyUnicode_Check(unicode)) {
5552 PyErr_BadArgument();
5553 return NULL;
5554 }
5555 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005556 PyUnicode_GET_SIZE(unicode),
5557 NULL,
5558 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005559}
5560
5561/* --- Unicode Escape Codec ----------------------------------------------- */
5562
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005563/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5564 if all the escapes in the string make it still a valid ASCII string.
5565 Returns -1 if any escapes were found which cause the string to
5566 pop out of ASCII range. Otherwise returns the length of the
5567 required buffer to hold the string.
5568 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005569static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005570length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5571{
5572 const unsigned char *p = (const unsigned char *)s;
5573 const unsigned char *end = p + size;
5574 Py_ssize_t length = 0;
5575
5576 if (size < 0)
5577 return -1;
5578
5579 for (; p < end; ++p) {
5580 if (*p > 127) {
5581 /* Non-ASCII */
5582 return -1;
5583 }
5584 else if (*p != '\\') {
5585 /* Normal character */
5586 ++length;
5587 }
5588 else {
5589 /* Backslash-escape, check next char */
5590 ++p;
5591 /* Escape sequence reaches till end of string or
5592 non-ASCII follow-up. */
5593 if (p >= end || *p > 127)
5594 return -1;
5595 switch (*p) {
5596 case '\n':
5597 /* backslash + \n result in zero characters */
5598 break;
5599 case '\\': case '\'': case '\"':
5600 case 'b': case 'f': case 't':
5601 case 'n': case 'r': case 'v': case 'a':
5602 ++length;
5603 break;
5604 case '0': case '1': case '2': case '3':
5605 case '4': case '5': case '6': case '7':
5606 case 'x': case 'u': case 'U': case 'N':
5607 /* these do not guarantee ASCII characters */
5608 return -1;
5609 default:
5610 /* count the backslash + the other character */
5611 length += 2;
5612 }
5613 }
5614 }
5615 return length;
5616}
5617
5618/* Similar to PyUnicode_WRITE but either write into wstr field
5619 or treat string as ASCII. */
5620#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \
5621 do { \
5622 if ((kind) != PyUnicode_WCHAR_KIND) \
5623 ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
5624 else \
5625 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
5626 } while (0)
5627
5628#define WRITE_WSTR(buf, index, value) \
5629 assert(kind == PyUnicode_WCHAR_KIND), \
5630 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value)
5631
5632
Fredrik Lundh06d12682001-01-24 07:59:11 +00005633static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005634
Alexander Belopolsky40018472011-02-26 01:02:56 +00005635PyObject *
5636PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005637 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005638 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005639{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005640 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005641 Py_ssize_t startinpos;
5642 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005643 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005644 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005645 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005646 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005647 char* message;
5648 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005649 PyObject *errorHandler = NULL;
5650 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005651 Py_ssize_t ascii_length;
5652 Py_ssize_t i;
5653 int kind;
5654 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005655
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005656 ascii_length = length_of_escaped_ascii_string(s, size);
5657
5658 /* After length_of_escaped_ascii_string() there are two alternatives,
5659 either the string is pure ASCII with named escapes like \n, etc.
5660 and we determined it's exact size (common case)
5661 or it contains \x, \u, ... escape sequences. then we create a
5662 legacy wchar string and resize it at the end of this function. */
5663 if (ascii_length >= 0) {
5664 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
5665 if (!v)
5666 goto onError;
5667 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5668 kind = PyUnicode_1BYTE_KIND;
5669 data = PyUnicode_DATA(v);
5670 }
5671 else {
5672 /* Escaped strings will always be longer than the resulting
5673 Unicode string, so we start with size here and then reduce the
5674 length after conversion to the true value.
5675 (but if the error callback returns a long replacement string
5676 we'll have to allocate more space) */
5677 v = _PyUnicode_New(size);
5678 if (!v)
5679 goto onError;
5680 kind = PyUnicode_WCHAR_KIND;
5681 data = PyUnicode_AS_UNICODE(v);
5682 }
5683
Guido van Rossumd57fd912000-03-10 22:53:23 +00005684 if (size == 0)
5685 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005686 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005687 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005688
Guido van Rossumd57fd912000-03-10 22:53:23 +00005689 while (s < end) {
5690 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005691 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005692 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005693
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005694 if (kind == PyUnicode_WCHAR_KIND) {
5695 assert(i < _PyUnicode_WSTR_LENGTH(v));
5696 }
5697 else {
5698 /* The only case in which i == ascii_length is a backslash
5699 followed by a newline. */
5700 assert(i <= ascii_length);
5701 }
5702
Guido van Rossumd57fd912000-03-10 22:53:23 +00005703 /* Non-escape characters are interpreted as Unicode ordinals */
5704 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005705 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005706 continue;
5707 }
5708
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005709 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005710 /* \ - Escapes */
5711 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005712 c = *s++;
5713 if (s > end)
5714 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005715
5716 if (kind == PyUnicode_WCHAR_KIND) {
5717 assert(i < _PyUnicode_WSTR_LENGTH(v));
5718 }
5719 else {
5720 /* The only case in which i == ascii_length is a backslash
5721 followed by a newline. */
5722 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5723 }
5724
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005725 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005726
Benjamin Peterson29060642009-01-31 22:14:21 +00005727 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005728 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005729 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5730 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5731 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5732 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5733 /* FF */
5734 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5735 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5736 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5737 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5738 /* VT */
5739 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5740 /* BEL, not classic C */
5741 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005742
Benjamin Peterson29060642009-01-31 22:14:21 +00005743 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005744 case '0': case '1': case '2': case '3':
5745 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005746 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005747 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005748 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005749 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005750 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005751 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005752 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005753 break;
5754
Benjamin Peterson29060642009-01-31 22:14:21 +00005755 /* hex escapes */
5756 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005757 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005758 digits = 2;
5759 message = "truncated \\xXX escape";
5760 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005761
Benjamin Peterson29060642009-01-31 22:14:21 +00005762 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005763 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005764 digits = 4;
5765 message = "truncated \\uXXXX escape";
5766 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005767
Benjamin Peterson29060642009-01-31 22:14:21 +00005768 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005769 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005770 digits = 8;
5771 message = "truncated \\UXXXXXXXX escape";
5772 hexescape:
5773 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005774 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005775 if (s+digits>end) {
5776 endinpos = size;
5777 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005778 errors, &errorHandler,
5779 "unicodeescape", "end of string in escape sequence",
5780 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005781 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005782 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005783 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005784 goto nextByte;
5785 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005786 for (j = 0; j < digits; ++j) {
5787 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005788 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005789 endinpos = (s+j+1)-starts;
5790 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005791 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005792 errors, &errorHandler,
5793 "unicodeescape", message,
5794 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005795 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005796 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005797 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005798 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005799 }
5800 chr = (chr<<4) & ~0xF;
5801 if (c >= '0' && c <= '9')
5802 chr += c - '0';
5803 else if (c >= 'a' && c <= 'f')
5804 chr += 10 + c - 'a';
5805 else
5806 chr += 10 + c - 'A';
5807 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005808 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005809 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005810 /* _decoding_error will have already written into the
5811 target buffer. */
5812 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005813 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005814 /* when we get here, chr is a 32-bit unicode character */
5815 if (chr <= 0xffff)
5816 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005817 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005818 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005819 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005820 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005821#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005822 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005823#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005824 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005825 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5826 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005827#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005828 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005829 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005830 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005831 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005832 errors, &errorHandler,
5833 "unicodeescape", "illegal Unicode character",
5834 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005835 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005836 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005837 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005838 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005839 break;
5840
Benjamin Peterson29060642009-01-31 22:14:21 +00005841 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005842 case 'N':
5843 message = "malformed \\N character escape";
5844 if (ucnhash_CAPI == NULL) {
5845 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005846 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5847 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005848 if (ucnhash_CAPI == NULL)
5849 goto ucnhashError;
5850 }
5851 if (*s == '{') {
5852 const char *start = s+1;
5853 /* look for the closing brace */
5854 while (*s != '}' && s < end)
5855 s++;
5856 if (s > start && s < end && *s == '}') {
5857 /* found a name. look it up in the unicode database */
5858 message = "unknown Unicode character name";
5859 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005860 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5861 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005862 goto store;
5863 }
5864 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005865 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005866 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005867 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005868 errors, &errorHandler,
5869 "unicodeescape", message,
5870 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005871 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005872 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005873 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005874 break;
5875
5876 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005877 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005878 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005879 message = "\\ at end of string";
5880 s--;
5881 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005882 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005883 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005884 errors, &errorHandler,
5885 "unicodeescape", message,
5886 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005887 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005888 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005889 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005890 }
5891 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005892 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5893 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005894 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005895 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005896 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005897 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005898 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005899 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005900 /* Ensure the length prediction worked in case of ASCII strings */
5901 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5902
Victor Stinnerfe226c02011-10-03 03:52:20 +02005903 if (kind == PyUnicode_WCHAR_KIND)
5904 {
5905 if (PyUnicode_Resize((PyObject**)&v, i) < 0)
5906 goto onError;
Victor Stinnerfe226c02011-10-03 03:52:20 +02005907 }
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005908 Py_XDECREF(errorHandler);
5909 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005910#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005911 if (_PyUnicode_READY_REPLACE(&v)) {
5912 Py_DECREF(v);
5913 return NULL;
5914 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005915#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005916 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005917 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005918
Benjamin Peterson29060642009-01-31 22:14:21 +00005919 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005920 PyErr_SetString(
5921 PyExc_UnicodeError,
5922 "\\N escapes not supported (can't load unicodedata module)"
5923 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005924 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005925 Py_XDECREF(errorHandler);
5926 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005927 return NULL;
5928
Benjamin Peterson29060642009-01-31 22:14:21 +00005929 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005930 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005931 Py_XDECREF(errorHandler);
5932 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005933 return NULL;
5934}
5935
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005936#undef WRITE_ASCII_OR_WSTR
5937#undef WRITE_WSTR
5938
/* Return a Unicode-Escape string version of the Unicode object.

   The encoder defined below takes no `quotes` argument and emits no
   surrounding quotes; the u"" / u'' quoting that this comment used to
   describe does not apply to it.

*/
5945
/* Lowercase hexadecimal digits indexed by nibble value (0..15).  Used by
   the escape encoders in this file to format \xhh, \uxxxx and \Uxxxxxxxx
   sequences.  Both the characters and the pointer are const so the table
   cannot be redirected by accident. */
static const char * const hexdigits = "0123456789abcdef";
5947
Alexander Belopolsky40018472011-02-26 01:02:56 +00005948PyObject *
5949PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005950 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005951{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005952 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005953 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005954
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005955#ifdef Py_UNICODE_WIDE
5956 const Py_ssize_t expandsize = 10;
5957#else
5958 const Py_ssize_t expandsize = 6;
5959#endif
5960
Thomas Wouters89f507f2006-12-13 04:49:30 +00005961 /* XXX(nnorwitz): rather than over-allocating, it would be
5962 better to choose a different scheme. Perhaps scan the
5963 first N-chars of the string and allocate based on that size.
5964 */
5965 /* Initial allocation is based on the longest-possible unichr
5966 escape.
5967
5968 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5969 unichr, so in this case it's the longest unichr escape. In
5970 narrow (UTF-16) builds this is five chars per source unichr
5971 since there are two unichrs in the surrogate pair, so in narrow
5972 (UTF-16) builds it's not the longest unichr escape.
5973
5974 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5975 so in the narrow (UTF-16) build case it's the longest unichr
5976 escape.
5977 */
5978
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005979 if (size == 0)
5980 return PyBytes_FromStringAndSize(NULL, 0);
5981
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005982 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005983 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005984
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005985 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005986 2
5987 + expandsize*size
5988 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005989 if (repr == NULL)
5990 return NULL;
5991
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005992 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005993
Guido van Rossumd57fd912000-03-10 22:53:23 +00005994 while (size-- > 0) {
5995 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005996
Walter Dörwald79e913e2007-05-12 11:08:06 +00005997 /* Escape backslashes */
5998 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005999 *p++ = '\\';
6000 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00006001 continue;
Tim Petersced69f82003-09-16 20:30:58 +00006002 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006003
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00006004#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006005 /* Map 21-bit characters to '\U00xxxxxx' */
6006 else if (ch >= 0x10000) {
6007 *p++ = '\\';
6008 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00006009 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
6010 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
6011 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
6012 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
6013 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
6014 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
6015 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
6016 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00006017 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006018 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00006019#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006020 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
6021 else if (ch >= 0xD800 && ch < 0xDC00) {
6022 Py_UNICODE ch2;
6023 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00006024
Benjamin Peterson29060642009-01-31 22:14:21 +00006025 ch2 = *s++;
6026 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00006027 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006028 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
6029 *p++ = '\\';
6030 *p++ = 'U';
6031 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
6032 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
6033 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
6034 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
6035 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
6036 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
6037 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
6038 *p++ = hexdigits[ucs & 0x0000000F];
6039 continue;
6040 }
6041 /* Fall through: isolated surrogates are copied as-is */
6042 s--;
6043 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006044 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00006045#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006046
Guido van Rossumd57fd912000-03-10 22:53:23 +00006047 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006048 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006049 *p++ = '\\';
6050 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00006051 *p++ = hexdigits[(ch >> 12) & 0x000F];
6052 *p++ = hexdigits[(ch >> 8) & 0x000F];
6053 *p++ = hexdigits[(ch >> 4) & 0x000F];
6054 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006055 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006056
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006057 /* Map special whitespace to '\t', \n', '\r' */
6058 else if (ch == '\t') {
6059 *p++ = '\\';
6060 *p++ = 't';
6061 }
6062 else if (ch == '\n') {
6063 *p++ = '\\';
6064 *p++ = 'n';
6065 }
6066 else if (ch == '\r') {
6067 *p++ = '\\';
6068 *p++ = 'r';
6069 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006070
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006071 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006072 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006073 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006074 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00006075 *p++ = hexdigits[(ch >> 4) & 0x000F];
6076 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006077 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006078
Guido van Rossumd57fd912000-03-10 22:53:23 +00006079 /* Copy everything else as-is */
6080 else
6081 *p++ = (char) ch;
6082 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006083
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006084 assert(p - PyBytes_AS_STRING(repr) > 0);
6085 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6086 return NULL;
6087 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006088}
6089
Alexander Belopolsky40018472011-02-26 01:02:56 +00006090PyObject *
6091PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006092{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006093 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006094 if (!PyUnicode_Check(unicode)) {
6095 PyErr_BadArgument();
6096 return NULL;
6097 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00006098 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
6099 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006100 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006101}
6102
6103/* --- Raw Unicode Escape Codec ------------------------------------------- */
6104
Alexander Belopolsky40018472011-02-26 01:02:56 +00006105PyObject *
6106PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006107 Py_ssize_t size,
6108 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006109{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006110 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006111 Py_ssize_t startinpos;
6112 Py_ssize_t endinpos;
6113 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006114 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006115 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006116 const char *end;
6117 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006118 PyObject *errorHandler = NULL;
6119 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006120
Guido van Rossumd57fd912000-03-10 22:53:23 +00006121 /* Escaped strings will always be longer than the resulting
6122 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006123 length after conversion to the true value. (But decoding error
6124 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006125 v = _PyUnicode_New(size);
6126 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006127 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006128 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006129 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006130 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006131 end = s + size;
6132 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006133 unsigned char c;
6134 Py_UCS4 x;
6135 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006136 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006137
Benjamin Peterson29060642009-01-31 22:14:21 +00006138 /* Non-escape characters are interpreted as Unicode ordinals */
6139 if (*s != '\\') {
6140 *p++ = (unsigned char)*s++;
6141 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006142 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006143 startinpos = s-starts;
6144
6145 /* \u-escapes are only interpreted iff the number of leading
6146 backslashes if odd */
6147 bs = s;
6148 for (;s < end;) {
6149 if (*s != '\\')
6150 break;
6151 *p++ = (unsigned char)*s++;
6152 }
6153 if (((s - bs) & 1) == 0 ||
6154 s >= end ||
6155 (*s != 'u' && *s != 'U')) {
6156 continue;
6157 }
6158 p--;
6159 count = *s=='u' ? 4 : 8;
6160 s++;
6161
6162 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
6163 outpos = p-PyUnicode_AS_UNICODE(v);
6164 for (x = 0, i = 0; i < count; ++i, ++s) {
6165 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006166 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006167 endinpos = s-starts;
6168 if (unicode_decode_call_errorhandler(
6169 errors, &errorHandler,
6170 "rawunicodeescape", "truncated \\uXXXX",
6171 &starts, &end, &startinpos, &endinpos, &exc, &s,
6172 &v, &outpos, &p))
6173 goto onError;
6174 goto nextByte;
6175 }
6176 x = (x<<4) & ~0xF;
6177 if (c >= '0' && c <= '9')
6178 x += c - '0';
6179 else if (c >= 'a' && c <= 'f')
6180 x += 10 + c - 'a';
6181 else
6182 x += 10 + c - 'A';
6183 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00006184 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00006185 /* UCS-2 character */
6186 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006187 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006188 /* UCS-4 character. Either store directly, or as
6189 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00006190#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006191 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006192#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006193 x -= 0x10000L;
6194 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
6195 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00006196#endif
6197 } else {
6198 endinpos = s-starts;
6199 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006200 if (unicode_decode_call_errorhandler(
6201 errors, &errorHandler,
6202 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006203 &starts, &end, &startinpos, &endinpos, &exc, &s,
6204 &v, &outpos, &p))
6205 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006206 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006207 nextByte:
6208 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006209 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02006210 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006211 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006212 Py_XDECREF(errorHandler);
6213 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006214#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006215 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006216 Py_DECREF(v);
6217 return NULL;
6218 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006219#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006220 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006221 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006222
Benjamin Peterson29060642009-01-31 22:14:21 +00006223 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006224 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006225 Py_XDECREF(errorHandler);
6226 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006227 return NULL;
6228}
6229
Alexander Belopolsky40018472011-02-26 01:02:56 +00006230PyObject *
6231PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006232 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006233{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006234 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006235 char *p;
6236 char *q;
6237
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006238#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006239 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006240#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006241 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006242#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00006243
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006244 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006245 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006246
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006247 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006248 if (repr == NULL)
6249 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00006250 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006251 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006252
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006253 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006254 while (size-- > 0) {
6255 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006256#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006257 /* Map 32-bit characters to '\Uxxxxxxxx' */
6258 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006259 *p++ = '\\';
6260 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00006261 *p++ = hexdigits[(ch >> 28) & 0xf];
6262 *p++ = hexdigits[(ch >> 24) & 0xf];
6263 *p++ = hexdigits[(ch >> 20) & 0xf];
6264 *p++ = hexdigits[(ch >> 16) & 0xf];
6265 *p++ = hexdigits[(ch >> 12) & 0xf];
6266 *p++ = hexdigits[(ch >> 8) & 0xf];
6267 *p++ = hexdigits[(ch >> 4) & 0xf];
6268 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006269 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006270 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00006271#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006272 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
6273 if (ch >= 0xD800 && ch < 0xDC00) {
6274 Py_UNICODE ch2;
6275 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006276
Benjamin Peterson29060642009-01-31 22:14:21 +00006277 ch2 = *s++;
6278 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00006279 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006280 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
6281 *p++ = '\\';
6282 *p++ = 'U';
6283 *p++ = hexdigits[(ucs >> 28) & 0xf];
6284 *p++ = hexdigits[(ucs >> 24) & 0xf];
6285 *p++ = hexdigits[(ucs >> 20) & 0xf];
6286 *p++ = hexdigits[(ucs >> 16) & 0xf];
6287 *p++ = hexdigits[(ucs >> 12) & 0xf];
6288 *p++ = hexdigits[(ucs >> 8) & 0xf];
6289 *p++ = hexdigits[(ucs >> 4) & 0xf];
6290 *p++ = hexdigits[ucs & 0xf];
6291 continue;
6292 }
6293 /* Fall through: isolated surrogates are copied as-is */
6294 s--;
6295 size++;
6296 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006297#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006298 /* Map 16-bit characters to '\uxxxx' */
6299 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006300 *p++ = '\\';
6301 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00006302 *p++ = hexdigits[(ch >> 12) & 0xf];
6303 *p++ = hexdigits[(ch >> 8) & 0xf];
6304 *p++ = hexdigits[(ch >> 4) & 0xf];
6305 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006306 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006307 /* Copy everything else as-is */
6308 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006309 *p++ = (char) ch;
6310 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006311 size = p - q;
6312
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006313 assert(size > 0);
6314 if (_PyBytes_Resize(&repr, size) < 0)
6315 return NULL;
6316 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006317}
6318
Alexander Belopolsky40018472011-02-26 01:02:56 +00006319PyObject *
6320PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006321{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006322 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006323 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00006324 PyErr_BadArgument();
6325 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006326 }
Walter Dörwald711005d2007-05-12 12:03:26 +00006327 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
6328 PyUnicode_GET_SIZE(unicode));
6329
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006330 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006331}
6332
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006333/* --- Unicode Internal Codec ------------------------------------------- */
6334
Alexander Belopolsky40018472011-02-26 01:02:56 +00006335PyObject *
6336_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006337 Py_ssize_t size,
6338 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006339{
6340 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006341 Py_ssize_t startinpos;
6342 Py_ssize_t endinpos;
6343 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006344 PyUnicodeObject *v;
6345 Py_UNICODE *p;
6346 const char *end;
6347 const char *reason;
6348 PyObject *errorHandler = NULL;
6349 PyObject *exc = NULL;
6350
Neal Norwitzd43069c2006-01-08 01:12:10 +00006351#ifdef Py_UNICODE_WIDE
6352 Py_UNICODE unimax = PyUnicode_GetMax();
6353#endif
6354
Thomas Wouters89f507f2006-12-13 04:49:30 +00006355 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006356 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
6357 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006358 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006359 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
6360 as string was created with the old API. */
6361 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006362 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006363 p = PyUnicode_AS_UNICODE(v);
6364 end = s + size;
6365
6366 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006367 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006368 /* We have to sanity check the raw data, otherwise doom looms for
6369 some malformed UCS-4 data. */
6370 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006371#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006372 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006373#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006374 end-s < Py_UNICODE_SIZE
6375 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006376 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006377 startinpos = s - starts;
6378 if (end-s < Py_UNICODE_SIZE) {
6379 endinpos = end-starts;
6380 reason = "truncated input";
6381 }
6382 else {
6383 endinpos = s - starts + Py_UNICODE_SIZE;
6384 reason = "illegal code point (> 0x10FFFF)";
6385 }
6386 outpos = p - PyUnicode_AS_UNICODE(v);
6387 if (unicode_decode_call_errorhandler(
6388 errors, &errorHandler,
6389 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006390 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00006391 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006392 goto onError;
6393 }
6394 }
6395 else {
6396 p++;
6397 s += Py_UNICODE_SIZE;
6398 }
6399 }
6400
Victor Stinnerfe226c02011-10-03 03:52:20 +02006401 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006402 goto onError;
6403 Py_XDECREF(errorHandler);
6404 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006405#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006406 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006407 Py_DECREF(v);
6408 return NULL;
6409 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006410#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006411 assert(_PyUnicode_CheckConsistency(v, 1));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006412 return (PyObject *)v;
6413
Benjamin Peterson29060642009-01-31 22:14:21 +00006414 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006415 Py_XDECREF(v);
6416 Py_XDECREF(errorHandler);
6417 Py_XDECREF(exc);
6418 return NULL;
6419}
6420
Guido van Rossumd57fd912000-03-10 22:53:23 +00006421/* --- Latin-1 Codec ------------------------------------------------------ */
6422
Alexander Belopolsky40018472011-02-26 01:02:56 +00006423PyObject *
6424PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006425 Py_ssize_t size,
6426 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006427{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006428 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006429 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006430}
6431
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006432/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006433static void
6434make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006435 const char *encoding,
6436 const Py_UNICODE *unicode, Py_ssize_t size,
6437 Py_ssize_t startpos, Py_ssize_t endpos,
6438 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006439{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006440 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006441 *exceptionObject = PyUnicodeEncodeError_Create(
6442 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006443 }
6444 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006445 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6446 goto onError;
6447 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6448 goto onError;
6449 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6450 goto onError;
6451 return;
6452 onError:
6453 Py_DECREF(*exceptionObject);
6454 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006455 }
6456}
6457
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006458/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006459static void
6460raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006461 const char *encoding,
6462 const Py_UNICODE *unicode, Py_ssize_t size,
6463 Py_ssize_t startpos, Py_ssize_t endpos,
6464 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006465{
6466 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006467 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006468 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006469 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006470}
6471
6472/* error handling callback helper:
6473 build arguments, call the callback and check the arguments,
6474 put the result into newpos and return the replacement string, which
6475 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006476static PyObject *
6477unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006478 PyObject **errorHandler,
6479 const char *encoding, const char *reason,
6480 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
6481 Py_ssize_t startpos, Py_ssize_t endpos,
6482 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006483{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006484 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006485
6486 PyObject *restuple;
6487 PyObject *resunicode;
6488
6489 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006490 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006491 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006492 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006493 }
6494
6495 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006496 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006497 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006498 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006499
6500 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006501 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006502 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006503 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006504 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006505 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006506 Py_DECREF(restuple);
6507 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006508 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006509 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006510 &resunicode, newpos)) {
6511 Py_DECREF(restuple);
6512 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006513 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006514 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6515 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6516 Py_DECREF(restuple);
6517 return NULL;
6518 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006519 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006520 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006521 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006522 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6523 Py_DECREF(restuple);
6524 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006525 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006526 Py_INCREF(resunicode);
6527 Py_DECREF(restuple);
6528 return resunicode;
6529}
6530
Alexander Belopolsky40018472011-02-26 01:02:56 +00006531static PyObject *
6532unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006533 Py_ssize_t size,
6534 const char *errors,
6535 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006536{
6537 /* output object */
6538 PyObject *res;
6539 /* pointers to the beginning and end+1 of input */
6540 const Py_UNICODE *startp = p;
6541 const Py_UNICODE *endp = p + size;
6542 /* pointer to the beginning of the unencodable characters */
6543 /* const Py_UNICODE *badp = NULL; */
6544 /* pointer into the output */
6545 char *str;
6546 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006547 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006548 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6549 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006550 PyObject *errorHandler = NULL;
6551 PyObject *exc = NULL;
6552 /* the following variable is used for caching string comparisons
6553 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6554 int known_errorHandler = -1;
6555
6556 /* allocate enough for a simple encoding without
6557 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006558 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006559 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006560 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006561 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006562 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006563 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006564 ressize = size;
6565
6566 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006567 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006568
Benjamin Peterson29060642009-01-31 22:14:21 +00006569 /* can we encode this? */
6570 if (c<limit) {
6571 /* no overflow check, because we know that the space is enough */
6572 *str++ = (char)c;
6573 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006574 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006575 else {
6576 Py_ssize_t unicodepos = p-startp;
6577 Py_ssize_t requiredsize;
6578 PyObject *repunicode;
6579 Py_ssize_t repsize;
6580 Py_ssize_t newpos;
6581 Py_ssize_t respos;
6582 Py_UNICODE *uni2;
6583 /* startpos for collecting unencodable chars */
6584 const Py_UNICODE *collstart = p;
6585 const Py_UNICODE *collend = p;
6586 /* find all unecodable characters */
6587 while ((collend < endp) && ((*collend)>=limit))
6588 ++collend;
6589 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6590 if (known_errorHandler==-1) {
6591 if ((errors==NULL) || (!strcmp(errors, "strict")))
6592 known_errorHandler = 1;
6593 else if (!strcmp(errors, "replace"))
6594 known_errorHandler = 2;
6595 else if (!strcmp(errors, "ignore"))
6596 known_errorHandler = 3;
6597 else if (!strcmp(errors, "xmlcharrefreplace"))
6598 known_errorHandler = 4;
6599 else
6600 known_errorHandler = 0;
6601 }
6602 switch (known_errorHandler) {
6603 case 1: /* strict */
6604 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
6605 goto onError;
6606 case 2: /* replace */
6607 while (collstart++<collend)
6608 *str++ = '?'; /* fall through */
6609 case 3: /* ignore */
6610 p = collend;
6611 break;
6612 case 4: /* xmlcharrefreplace */
6613 respos = str - PyBytes_AS_STRING(res);
6614 /* determine replacement size (temporarily (mis)uses p) */
6615 for (p = collstart, repsize = 0; p < collend; ++p) {
6616 if (*p<10)
6617 repsize += 2+1+1;
6618 else if (*p<100)
6619 repsize += 2+2+1;
6620 else if (*p<1000)
6621 repsize += 2+3+1;
6622 else if (*p<10000)
6623 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006624#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006625 else
6626 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006627#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006628 else if (*p<100000)
6629 repsize += 2+5+1;
6630 else if (*p<1000000)
6631 repsize += 2+6+1;
6632 else
6633 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00006634#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006635 }
6636 requiredsize = respos+repsize+(endp-collend);
6637 if (requiredsize > ressize) {
6638 if (requiredsize<2*ressize)
6639 requiredsize = 2*ressize;
6640 if (_PyBytes_Resize(&res, requiredsize))
6641 goto onError;
6642 str = PyBytes_AS_STRING(res) + respos;
6643 ressize = requiredsize;
6644 }
6645 /* generate replacement (temporarily (mis)uses p) */
6646 for (p = collstart; p < collend; ++p) {
6647 str += sprintf(str, "&#%d;", (int)*p);
6648 }
6649 p = collend;
6650 break;
6651 default:
6652 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6653 encoding, reason, startp, size, &exc,
6654 collstart-startp, collend-startp, &newpos);
6655 if (repunicode == NULL)
6656 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006657 if (PyBytes_Check(repunicode)) {
6658 /* Directly copy bytes result to output. */
6659 repsize = PyBytes_Size(repunicode);
6660 if (repsize > 1) {
6661 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006662 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006663 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6664 Py_DECREF(repunicode);
6665 goto onError;
6666 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006667 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006668 ressize += repsize-1;
6669 }
6670 memcpy(str, PyBytes_AsString(repunicode), repsize);
6671 str += repsize;
6672 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006673 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006674 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006675 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006676 /* need more space? (at least enough for what we
6677 have+the replacement+the rest of the string, so
6678 we won't have to check space for encodable characters) */
6679 respos = str - PyBytes_AS_STRING(res);
6680 repsize = PyUnicode_GET_SIZE(repunicode);
6681 requiredsize = respos+repsize+(endp-collend);
6682 if (requiredsize > ressize) {
6683 if (requiredsize<2*ressize)
6684 requiredsize = 2*ressize;
6685 if (_PyBytes_Resize(&res, requiredsize)) {
6686 Py_DECREF(repunicode);
6687 goto onError;
6688 }
6689 str = PyBytes_AS_STRING(res) + respos;
6690 ressize = requiredsize;
6691 }
6692 /* check if there is anything unencodable in the replacement
6693 and copy it to the output */
6694 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
6695 c = *uni2;
6696 if (c >= limit) {
6697 raise_encode_exception(&exc, encoding, startp, size,
6698 unicodepos, unicodepos+1, reason);
6699 Py_DECREF(repunicode);
6700 goto onError;
6701 }
6702 *str = (char)c;
6703 }
6704 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006705 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006706 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006707 }
6708 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006709 /* Resize if we allocated to much */
6710 size = str - PyBytes_AS_STRING(res);
6711 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006712 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006713 if (_PyBytes_Resize(&res, size) < 0)
6714 goto onError;
6715 }
6716
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006717 Py_XDECREF(errorHandler);
6718 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006719 return res;
6720
6721 onError:
6722 Py_XDECREF(res);
6723 Py_XDECREF(errorHandler);
6724 Py_XDECREF(exc);
6725 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006726}
6727
Alexander Belopolsky40018472011-02-26 01:02:56 +00006728PyObject *
6729PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006730 Py_ssize_t size,
6731 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006732{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006733 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006734}
6735
Alexander Belopolsky40018472011-02-26 01:02:56 +00006736PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006737_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006738{
6739 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006740 PyErr_BadArgument();
6741 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006742 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006743 if (PyUnicode_READY(unicode) == -1)
6744 return NULL;
6745 /* Fast path: if it is a one-byte string, construct
6746 bytes object directly. */
6747 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6748 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6749 PyUnicode_GET_LENGTH(unicode));
6750 /* Non-Latin-1 characters present. Defer to above function to
6751 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006752 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006753 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006754 errors);
6755}
6756
6757PyObject*
6758PyUnicode_AsLatin1String(PyObject *unicode)
6759{
6760 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006761}
6762
6763/* --- 7-bit ASCII Codec -------------------------------------------------- */
6764
Alexander Belopolsky40018472011-02-26 01:02:56 +00006765PyObject *
6766PyUnicode_DecodeASCII(const char *s,
6767 Py_ssize_t size,
6768 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006769{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006770 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006771 PyUnicodeObject *v;
Victor Stinner702c7342011-10-05 13:50:52 +02006772 Py_UNICODE *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006773 Py_ssize_t startinpos;
6774 Py_ssize_t endinpos;
6775 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006776 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006777 int has_error;
6778 const unsigned char *p = (const unsigned char *)s;
6779 const unsigned char *end = p + size;
6780 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006781 PyObject *errorHandler = NULL;
6782 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006783
Guido van Rossumd57fd912000-03-10 22:53:23 +00006784 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006785 if (size == 1 && (unsigned char)s[0] < 128)
6786 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006787
Victor Stinner702c7342011-10-05 13:50:52 +02006788 has_error = 0;
6789 while (p < end && !has_error) {
6790 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6791 an explanation. */
6792 if (!((size_t) p & LONG_PTR_MASK)) {
6793 /* Help register allocation */
6794 register const unsigned char *_p = p;
6795 while (_p < aligned_end) {
6796 unsigned long value = *(unsigned long *) _p;
6797 if (value & ASCII_CHAR_MASK) {
6798 has_error = 1;
6799 break;
6800 }
6801 _p += SIZEOF_LONG;
6802 }
6803 if (_p == end)
6804 break;
6805 if (has_error)
6806 break;
6807 p = _p;
6808 }
6809 if (*p & 0x80) {
6810 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006811 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006812 }
6813 else {
6814 ++p;
6815 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006816 }
Victor Stinner702c7342011-10-05 13:50:52 +02006817 if (!has_error)
6818 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006819
Guido van Rossumd57fd912000-03-10 22:53:23 +00006820 v = _PyUnicode_New(size);
6821 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006822 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006823 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006824 return (PyObject *)v;
Victor Stinner702c7342011-10-05 13:50:52 +02006825 u = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006826 e = s + size;
6827 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006828 register unsigned char c = (unsigned char)*s;
6829 if (c < 128) {
Victor Stinner702c7342011-10-05 13:50:52 +02006830 *u++ = c;
Benjamin Peterson29060642009-01-31 22:14:21 +00006831 ++s;
6832 }
6833 else {
6834 startinpos = s-starts;
6835 endinpos = startinpos + 1;
Victor Stinner702c7342011-10-05 13:50:52 +02006836 outpos = u - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006837 if (unicode_decode_call_errorhandler(
6838 errors, &errorHandler,
6839 "ascii", "ordinal not in range(128)",
6840 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinner702c7342011-10-05 13:50:52 +02006841 &v, &outpos, &u))
Benjamin Peterson29060642009-01-31 22:14:21 +00006842 goto onError;
6843 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006844 }
Victor Stinner702c7342011-10-05 13:50:52 +02006845 if (u - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
6846 if (PyUnicode_Resize((PyObject**)&v, u - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006847 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006848 Py_XDECREF(errorHandler);
6849 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006850#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006851 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006852 Py_DECREF(v);
6853 return NULL;
6854 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006855#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006856 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006857 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006858
Benjamin Peterson29060642009-01-31 22:14:21 +00006859 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006860 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006861 Py_XDECREF(errorHandler);
6862 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006863 return NULL;
6864}
6865
Alexander Belopolsky40018472011-02-26 01:02:56 +00006866PyObject *
6867PyUnicode_EncodeASCII(const Py_UNICODE *p,
6868 Py_ssize_t size,
6869 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006870{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006871 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006872}
6873
Alexander Belopolsky40018472011-02-26 01:02:56 +00006874PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006875_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006876{
6877 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006878 PyErr_BadArgument();
6879 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006880 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006881 if (PyUnicode_READY(unicode) == -1)
6882 return NULL;
6883 /* Fast path: if it is an ASCII-only string, construct bytes object
6884 directly. Else defer to above function to raise the exception. */
6885 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6886 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6887 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006888 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006889 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006890 errors);
6891}
6892
6893PyObject *
6894PyUnicode_AsASCIIString(PyObject *unicode)
6895{
6896 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006897}
6898
Victor Stinner99b95382011-07-04 14:23:54 +02006899#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006900
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006901/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006902
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006903#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006904#define NEED_RETRY
6905#endif
6906
6907/* XXX This code is limited to "true" double-byte encodings, as
6908 a) it assumes an incomplete character consists of a single byte, and
6909 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006910 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006911
/* Return non-zero when the byte at s[offset] is a DBCS lead byte that
   starts a character, as judged from IsDBCSLeadByte() on that byte and on
   the character CharPrev() reports before it; return 0 otherwise. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    return (prev == curr) || !IsDBCSLeadByte(*prev) || (curr - prev == 2);
}
6923
6924/*
6925 * Decode MBCS string into unicode object. If 'final' is set, converts
6926 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6927 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006928static int
6929decode_mbcs(PyUnicodeObject **v,
6930 const char *s, /* MBCS string */
6931 int size, /* sizeof MBCS string */
6932 int final,
6933 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006934{
6935 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006936 Py_ssize_t n;
6937 DWORD usize;
6938 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006939
6940 assert(size >= 0);
6941
Victor Stinner554f3f02010-06-16 23:33:54 +00006942 /* check and handle 'errors' arg */
6943 if (errors==NULL || strcmp(errors, "strict")==0)
6944 flags = MB_ERR_INVALID_CHARS;
6945 else if (strcmp(errors, "ignore")==0)
6946 flags = 0;
6947 else {
6948 PyErr_Format(PyExc_ValueError,
6949 "mbcs encoding does not support errors='%s'",
6950 errors);
6951 return -1;
6952 }
6953
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006954 /* Skip trailing lead-byte unless 'final' is set */
6955 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006956 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006957
6958 /* First get the size of the result */
6959 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006960 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6961 if (usize==0)
6962 goto mbcs_decode_error;
6963 } else
6964 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006965
6966 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006967 /* Create unicode object */
6968 *v = _PyUnicode_New(usize);
6969 if (*v == NULL)
6970 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006971 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006972 }
6973 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006974 /* Extend unicode object */
6975 n = PyUnicode_GET_SIZE(*v);
Victor Stinner2fd82272011-10-03 04:06:05 +02006976 if (PyUnicode_Resize((PyObject**)v, n + usize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006977 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006978 }
6979
6980 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006981 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006982 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006983 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6984 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006985 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006986 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006987 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006988
6989mbcs_decode_error:
6990 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6991 we raise a UnicodeDecodeError - else it is a 'generic'
6992 windows error
6993 */
6994 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6995 /* Ideally, we should get reason from FormatMessage - this
6996 is the Windows 2000 English version of the message
6997 */
6998 PyObject *exc = NULL;
6999 const char *reason = "No mapping for the Unicode character exists "
7000 "in the target multi-byte code page.";
7001 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
7002 if (exc != NULL) {
7003 PyCodec_StrictErrors(exc);
7004 Py_DECREF(exc);
7005 }
7006 } else {
7007 PyErr_SetFromWindowsErrWithFilename(0, NULL);
7008 }
7009 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007010}
7011
Alexander Belopolsky40018472011-02-26 01:02:56 +00007012PyObject *
7013PyUnicode_DecodeMBCSStateful(const char *s,
7014 Py_ssize_t size,
7015 const char *errors,
7016 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007017{
7018 PyUnicodeObject *v = NULL;
7019 int done;
7020
7021 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007022 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007023
7024#ifdef NEED_RETRY
7025 retry:
7026 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00007027 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007028 else
7029#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00007030 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007031
7032 if (done < 0) {
7033 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00007034 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007035 }
7036
7037 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007038 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007039
7040#ifdef NEED_RETRY
7041 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007042 s += done;
7043 size -= done;
7044 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007045 }
7046#endif
Victor Stinner17efeed2011-10-04 20:05:46 +02007047#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007048 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007049 Py_DECREF(v);
7050 return NULL;
7051 }
Victor Stinner17efeed2011-10-04 20:05:46 +02007052#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02007053 assert(_PyUnicode_CheckConsistency(v, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007054 return (PyObject *)v;
7055}
7056
Alexander Belopolsky40018472011-02-26 01:02:56 +00007057PyObject *
7058PyUnicode_DecodeMBCS(const char *s,
7059 Py_ssize_t size,
7060 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007061{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007062 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7063}
7064
7065/*
7066 * Convert unicode into string object (MBCS).
7067 * Returns 0 if succeed, -1 otherwise.
7068 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007069static int
7070encode_mbcs(PyObject **repr,
7071 const Py_UNICODE *p, /* unicode */
7072 int size, /* size of unicode */
7073 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007074{
Victor Stinner554f3f02010-06-16 23:33:54 +00007075 BOOL usedDefaultChar = FALSE;
7076 BOOL *pusedDefaultChar;
7077 int mbcssize;
7078 Py_ssize_t n;
7079 PyObject *exc = NULL;
7080 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007081
7082 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007083
Victor Stinner554f3f02010-06-16 23:33:54 +00007084 /* check and handle 'errors' arg */
7085 if (errors==NULL || strcmp(errors, "strict")==0) {
7086 flags = WC_NO_BEST_FIT_CHARS;
7087 pusedDefaultChar = &usedDefaultChar;
7088 } else if (strcmp(errors, "replace")==0) {
7089 flags = 0;
7090 pusedDefaultChar = NULL;
7091 } else {
7092 PyErr_Format(PyExc_ValueError,
7093 "mbcs encoding does not support errors='%s'",
7094 errors);
7095 return -1;
7096 }
7097
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007098 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007099 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00007100 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
7101 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00007102 if (mbcssize == 0) {
7103 PyErr_SetFromWindowsErrWithFilename(0, NULL);
7104 return -1;
7105 }
Victor Stinner554f3f02010-06-16 23:33:54 +00007106 /* If we used a default char, then we failed! */
7107 if (pusedDefaultChar && *pusedDefaultChar)
7108 goto mbcs_encode_error;
7109 } else {
7110 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007111 }
7112
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007113 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007114 /* Create string object */
7115 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
7116 if (*repr == NULL)
7117 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00007118 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007119 }
7120 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007121 /* Extend string object */
7122 n = PyBytes_Size(*repr);
7123 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
7124 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007125 }
7126
7127 /* Do the conversion */
7128 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007129 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00007130 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
7131 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007132 PyErr_SetFromWindowsErrWithFilename(0, NULL);
7133 return -1;
7134 }
Victor Stinner554f3f02010-06-16 23:33:54 +00007135 if (pusedDefaultChar && *pusedDefaultChar)
7136 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007137 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007138 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007139
7140mbcs_encode_error:
7141 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
7142 Py_XDECREF(exc);
7143 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007144}
7145
Alexander Belopolsky40018472011-02-26 01:02:56 +00007146PyObject *
7147PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7148 Py_ssize_t size,
7149 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007150{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007151 PyObject *repr = NULL;
7152 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00007153
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007154#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00007155 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007156 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00007157 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007158 else
7159#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00007160 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007161
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007162 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007163 Py_XDECREF(repr);
7164 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007165 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007166
7167#ifdef NEED_RETRY
7168 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007169 p += INT_MAX;
7170 size -= INT_MAX;
7171 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007172 }
7173#endif
7174
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007175 return repr;
7176}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007177
Alexander Belopolsky40018472011-02-26 01:02:56 +00007178PyObject *
7179PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007180{
7181 if (!PyUnicode_Check(unicode)) {
7182 PyErr_BadArgument();
7183 return NULL;
7184 }
7185 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007186 PyUnicode_GET_SIZE(unicode),
7187 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007188}
7189
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007190#undef NEED_RETRY
7191
Victor Stinner99b95382011-07-04 14:23:54 +02007192#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007193
Guido van Rossumd57fd912000-03-10 22:53:23 +00007194/* --- Character Mapping Codec -------------------------------------------- */
7195
Alexander Belopolsky40018472011-02-26 01:02:56 +00007196PyObject *
7197PyUnicode_DecodeCharmap(const char *s,
7198 Py_ssize_t size,
7199 PyObject *mapping,
7200 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007201{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007202 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007203 Py_ssize_t startinpos;
7204 Py_ssize_t endinpos;
7205 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007206 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007207 PyUnicodeObject *v;
7208 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007209 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007210 PyObject *errorHandler = NULL;
7211 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007212 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007213 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00007214
Guido van Rossumd57fd912000-03-10 22:53:23 +00007215 /* Default to Latin-1 */
7216 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007217 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007218
7219 v = _PyUnicode_New(size);
7220 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007221 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007222 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007223 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007224 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007225 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007226 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007227 mapstring = PyUnicode_AS_UNICODE(mapping);
7228 maplen = PyUnicode_GET_SIZE(mapping);
7229 while (s < e) {
7230 unsigned char ch = *s;
7231 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007232
Benjamin Peterson29060642009-01-31 22:14:21 +00007233 if (ch < maplen)
7234 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00007235
Benjamin Peterson29060642009-01-31 22:14:21 +00007236 if (x == 0xfffe) {
7237 /* undefined mapping */
7238 outpos = p-PyUnicode_AS_UNICODE(v);
7239 startinpos = s-starts;
7240 endinpos = startinpos+1;
7241 if (unicode_decode_call_errorhandler(
7242 errors, &errorHandler,
7243 "charmap", "character maps to <undefined>",
7244 &starts, &e, &startinpos, &endinpos, &exc, &s,
7245 &v, &outpos, &p)) {
7246 goto onError;
7247 }
7248 continue;
7249 }
7250 *p++ = x;
7251 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007252 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007253 }
7254 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007255 while (s < e) {
7256 unsigned char ch = *s;
7257 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007258
Benjamin Peterson29060642009-01-31 22:14:21 +00007259 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7260 w = PyLong_FromLong((long)ch);
7261 if (w == NULL)
7262 goto onError;
7263 x = PyObject_GetItem(mapping, w);
7264 Py_DECREF(w);
7265 if (x == NULL) {
7266 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7267 /* No mapping found means: mapping is undefined. */
7268 PyErr_Clear();
7269 x = Py_None;
7270 Py_INCREF(x);
7271 } else
7272 goto onError;
7273 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007274
Benjamin Peterson29060642009-01-31 22:14:21 +00007275 /* Apply mapping */
7276 if (PyLong_Check(x)) {
7277 long value = PyLong_AS_LONG(x);
7278 if (value < 0 || value > 65535) {
7279 PyErr_SetString(PyExc_TypeError,
7280 "character mapping must be in range(65536)");
7281 Py_DECREF(x);
7282 goto onError;
7283 }
7284 *p++ = (Py_UNICODE)value;
7285 }
7286 else if (x == Py_None) {
7287 /* undefined mapping */
7288 outpos = p-PyUnicode_AS_UNICODE(v);
7289 startinpos = s-starts;
7290 endinpos = startinpos+1;
7291 if (unicode_decode_call_errorhandler(
7292 errors, &errorHandler,
7293 "charmap", "character maps to <undefined>",
7294 &starts, &e, &startinpos, &endinpos, &exc, &s,
7295 &v, &outpos, &p)) {
7296 Py_DECREF(x);
7297 goto onError;
7298 }
7299 Py_DECREF(x);
7300 continue;
7301 }
7302 else if (PyUnicode_Check(x)) {
7303 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007304
Benjamin Peterson29060642009-01-31 22:14:21 +00007305 if (targetsize == 1)
7306 /* 1-1 mapping */
7307 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007308
Benjamin Peterson29060642009-01-31 22:14:21 +00007309 else if (targetsize > 1) {
7310 /* 1-n mapping */
7311 if (targetsize > extrachars) {
7312 /* resize first */
7313 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
7314 Py_ssize_t needed = (targetsize - extrachars) + \
7315 (targetsize << 2);
7316 extrachars += needed;
7317 /* XXX overflow detection missing */
Victor Stinnerfe226c02011-10-03 03:52:20 +02007318 if (PyUnicode_Resize((PyObject**)&v,
Benjamin Peterson29060642009-01-31 22:14:21 +00007319 PyUnicode_GET_SIZE(v) + needed) < 0) {
7320 Py_DECREF(x);
7321 goto onError;
7322 }
7323 p = PyUnicode_AS_UNICODE(v) + oldpos;
7324 }
7325 Py_UNICODE_COPY(p,
7326 PyUnicode_AS_UNICODE(x),
7327 targetsize);
7328 p += targetsize;
7329 extrachars -= targetsize;
7330 }
7331 /* 1-0 mapping: skip the character */
7332 }
7333 else {
7334 /* wrong return value */
7335 PyErr_SetString(PyExc_TypeError,
7336 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007337 Py_DECREF(x);
7338 goto onError;
7339 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007340 Py_DECREF(x);
7341 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007342 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007343 }
7344 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02007345 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007346 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007347 Py_XDECREF(errorHandler);
7348 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02007349#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007350 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007351 Py_DECREF(v);
7352 return NULL;
7353 }
Victor Stinner17efeed2011-10-04 20:05:46 +02007354#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02007355 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007356 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00007357
Benjamin Peterson29060642009-01-31 22:14:21 +00007358 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007359 Py_XDECREF(errorHandler);
7360 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007361 Py_XDECREF(v);
7362 return NULL;
7363}
7364
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007365/* Charmap encoding: the lookup table */
7366
Alexander Belopolsky40018472011-02-26 01:02:56 +00007367struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007368 PyObject_HEAD
7369 unsigned char level1[32];
7370 int count2, count3;
7371 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007372};
7373
7374static PyObject*
7375encoding_map_size(PyObject *obj, PyObject* args)
7376{
7377 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007378 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007379 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007380}
7381
7382static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007383 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007384 PyDoc_STR("Return the size (in bytes) of this object") },
7385 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007386};
7387
7388static void
7389encoding_map_dealloc(PyObject* o)
7390{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007391 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007392}
7393
7394static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007395 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007396 "EncodingMap", /*tp_name*/
7397 sizeof(struct encoding_map), /*tp_basicsize*/
7398 0, /*tp_itemsize*/
7399 /* methods */
7400 encoding_map_dealloc, /*tp_dealloc*/
7401 0, /*tp_print*/
7402 0, /*tp_getattr*/
7403 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007404 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007405 0, /*tp_repr*/
7406 0, /*tp_as_number*/
7407 0, /*tp_as_sequence*/
7408 0, /*tp_as_mapping*/
7409 0, /*tp_hash*/
7410 0, /*tp_call*/
7411 0, /*tp_str*/
7412 0, /*tp_getattro*/
7413 0, /*tp_setattro*/
7414 0, /*tp_as_buffer*/
7415 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7416 0, /*tp_doc*/
7417 0, /*tp_traverse*/
7418 0, /*tp_clear*/
7419 0, /*tp_richcompare*/
7420 0, /*tp_weaklistoffset*/
7421 0, /*tp_iter*/
7422 0, /*tp_iternext*/
7423 encoding_map_methods, /*tp_methods*/
7424 0, /*tp_members*/
7425 0, /*tp_getset*/
7426 0, /*tp_base*/
7427 0, /*tp_dict*/
7428 0, /*tp_descr_get*/
7429 0, /*tp_descr_set*/
7430 0, /*tp_dictoffset*/
7431 0, /*tp_init*/
7432 0, /*tp_alloc*/
7433 0, /*tp_new*/
7434 0, /*tp_free*/
7435 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007436};
7437
7438PyObject*
7439PyUnicode_BuildEncodingMap(PyObject* string)
7440{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007441 PyObject *result;
7442 struct encoding_map *mresult;
7443 int i;
7444 int need_dict = 0;
7445 unsigned char level1[32];
7446 unsigned char level2[512];
7447 unsigned char *mlevel1, *mlevel2, *mlevel3;
7448 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007449 int kind;
7450 void *data;
7451 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007452
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007453 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007454 PyErr_BadArgument();
7455 return NULL;
7456 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007457 kind = PyUnicode_KIND(string);
7458 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007459 memset(level1, 0xFF, sizeof level1);
7460 memset(level2, 0xFF, sizeof level2);
7461
7462 /* If there isn't a one-to-one mapping of NULL to \0,
7463 or if there are non-BMP characters, we need to use
7464 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007465 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007466 need_dict = 1;
7467 for (i = 1; i < 256; i++) {
7468 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007469 ch = PyUnicode_READ(kind, data, i);
7470 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007471 need_dict = 1;
7472 break;
7473 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007474 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007475 /* unmapped character */
7476 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007477 l1 = ch >> 11;
7478 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007479 if (level1[l1] == 0xFF)
7480 level1[l1] = count2++;
7481 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007482 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007483 }
7484
7485 if (count2 >= 0xFF || count3 >= 0xFF)
7486 need_dict = 1;
7487
7488 if (need_dict) {
7489 PyObject *result = PyDict_New();
7490 PyObject *key, *value;
7491 if (!result)
7492 return NULL;
7493 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007494 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007495 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007496 if (!key || !value)
7497 goto failed1;
7498 if (PyDict_SetItem(result, key, value) == -1)
7499 goto failed1;
7500 Py_DECREF(key);
7501 Py_DECREF(value);
7502 }
7503 return result;
7504 failed1:
7505 Py_XDECREF(key);
7506 Py_XDECREF(value);
7507 Py_DECREF(result);
7508 return NULL;
7509 }
7510
7511 /* Create a three-level trie */
7512 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7513 16*count2 + 128*count3 - 1);
7514 if (!result)
7515 return PyErr_NoMemory();
7516 PyObject_Init(result, &EncodingMapType);
7517 mresult = (struct encoding_map*)result;
7518 mresult->count2 = count2;
7519 mresult->count3 = count3;
7520 mlevel1 = mresult->level1;
7521 mlevel2 = mresult->level23;
7522 mlevel3 = mresult->level23 + 16*count2;
7523 memcpy(mlevel1, level1, 32);
7524 memset(mlevel2, 0xFF, 16*count2);
7525 memset(mlevel3, 0, 128*count3);
7526 count3 = 0;
7527 for (i = 1; i < 256; i++) {
7528 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007529 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007530 /* unmapped character */
7531 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007532 o1 = PyUnicode_READ(kind, data, i)>>11;
7533 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007534 i2 = 16*mlevel1[o1] + o2;
7535 if (mlevel2[i2] == 0xFF)
7536 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007537 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007538 i3 = 128*mlevel2[i2] + o3;
7539 mlevel3[i3] = i;
7540 }
7541 return result;
7542}
7543
7544static int
7545encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
7546{
7547 struct encoding_map *map = (struct encoding_map*)mapping;
7548 int l1 = c>>11;
7549 int l2 = (c>>7) & 0xF;
7550 int l3 = c & 0x7F;
7551 int i;
7552
7553#ifdef Py_UNICODE_WIDE
7554 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007555 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007556 }
7557#endif
7558 if (c == 0)
7559 return 0;
7560 /* level 1*/
7561 i = map->level1[l1];
7562 if (i == 0xFF) {
7563 return -1;
7564 }
7565 /* level 2*/
7566 i = map->level23[16*i+l2];
7567 if (i == 0xFF) {
7568 return -1;
7569 }
7570 /* level 3 */
7571 i = map->level23[16*map->count2 + 128*i + l3];
7572 if (i == 0) {
7573 return -1;
7574 }
7575 return i;
7576}
7577
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007578/* Lookup the character ch in the mapping. If the character
7579 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007580 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007581static PyObject *
7582charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007583{
Christian Heimes217cfd12007-12-02 14:31:20 +00007584 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007585 PyObject *x;
7586
7587 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007588 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007589 x = PyObject_GetItem(mapping, w);
7590 Py_DECREF(w);
7591 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007592 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7593 /* No mapping found means: mapping is undefined. */
7594 PyErr_Clear();
7595 x = Py_None;
7596 Py_INCREF(x);
7597 return x;
7598 } else
7599 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007600 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007601 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007602 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007603 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007604 long value = PyLong_AS_LONG(x);
7605 if (value < 0 || value > 255) {
7606 PyErr_SetString(PyExc_TypeError,
7607 "character mapping must be in range(256)");
7608 Py_DECREF(x);
7609 return NULL;
7610 }
7611 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007612 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007613 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007614 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007615 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007616 /* wrong return value */
7617 PyErr_Format(PyExc_TypeError,
7618 "character mapping must return integer, bytes or None, not %.400s",
7619 x->ob_type->tp_name);
7620 Py_DECREF(x);
7621 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007622 }
7623}
7624
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007625static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007626charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007627{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007628 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7629 /* exponentially overallocate to minimize reallocations */
7630 if (requiredsize < 2*outsize)
7631 requiredsize = 2*outsize;
7632 if (_PyBytes_Resize(outobj, requiredsize))
7633 return -1;
7634 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007635}
7636
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* bytes were written to the output */
    enc_FAILED,         /* the character has no mapping */
    enc_EXCEPTION       /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007640/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007641 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007642 space is available. Return a new reference to the object that
7643 was put in the output buffer, or Py_None, if the mapping was undefined
7644 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007645 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007646static charmapencode_result
7647charmapencode_output(Py_UNICODE c, PyObject *mapping,
7648 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007649{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007650 PyObject *rep;
7651 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007652 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007653
Christian Heimes90aa7642007-12-19 02:45:37 +00007654 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007655 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007656 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007657 if (res == -1)
7658 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007659 if (outsize<requiredsize)
7660 if (charmapencode_resize(outobj, outpos, requiredsize))
7661 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007662 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007663 outstart[(*outpos)++] = (char)res;
7664 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007665 }
7666
7667 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007668 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007669 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007670 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007671 Py_DECREF(rep);
7672 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007673 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007674 if (PyLong_Check(rep)) {
7675 Py_ssize_t requiredsize = *outpos+1;
7676 if (outsize<requiredsize)
7677 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7678 Py_DECREF(rep);
7679 return enc_EXCEPTION;
7680 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007681 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007682 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007683 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007684 else {
7685 const char *repchars = PyBytes_AS_STRING(rep);
7686 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7687 Py_ssize_t requiredsize = *outpos+repsize;
7688 if (outsize<requiredsize)
7689 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7690 Py_DECREF(rep);
7691 return enc_EXCEPTION;
7692 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007693 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007694 memcpy(outstart + *outpos, repchars, repsize);
7695 *outpos += repsize;
7696 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007697 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007698 Py_DECREF(rep);
7699 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007700}
7701
7702/* handle an error in PyUnicode_EncodeCharmap
7703 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007704static int
7705charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00007706 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007707 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007708 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007709 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007710{
7711 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007712 Py_ssize_t repsize;
7713 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007714 Py_UNICODE *uni2;
7715 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007716 Py_ssize_t collstartpos = *inpos;
7717 Py_ssize_t collendpos = *inpos+1;
7718 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007719 char *encoding = "charmap";
7720 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007721 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007722
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007723 /* find all unencodable characters */
7724 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007725 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007726 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007727 int res = encoding_map_lookup(p[collendpos], mapping);
7728 if (res != -1)
7729 break;
7730 ++collendpos;
7731 continue;
7732 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007733
Benjamin Peterson29060642009-01-31 22:14:21 +00007734 rep = charmapencode_lookup(p[collendpos], mapping);
7735 if (rep==NULL)
7736 return -1;
7737 else if (rep!=Py_None) {
7738 Py_DECREF(rep);
7739 break;
7740 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007741 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007742 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007743 }
7744 /* cache callback name lookup
7745 * (if not done yet, i.e. it's the first error) */
7746 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007747 if ((errors==NULL) || (!strcmp(errors, "strict")))
7748 *known_errorHandler = 1;
7749 else if (!strcmp(errors, "replace"))
7750 *known_errorHandler = 2;
7751 else if (!strcmp(errors, "ignore"))
7752 *known_errorHandler = 3;
7753 else if (!strcmp(errors, "xmlcharrefreplace"))
7754 *known_errorHandler = 4;
7755 else
7756 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007757 }
7758 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007759 case 1: /* strict */
7760 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7761 return -1;
7762 case 2: /* replace */
7763 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007764 x = charmapencode_output('?', mapping, res, respos);
7765 if (x==enc_EXCEPTION) {
7766 return -1;
7767 }
7768 else if (x==enc_FAILED) {
7769 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7770 return -1;
7771 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007772 }
7773 /* fall through */
7774 case 3: /* ignore */
7775 *inpos = collendpos;
7776 break;
7777 case 4: /* xmlcharrefreplace */
7778 /* generate replacement (temporarily (mis)uses p) */
7779 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007780 char buffer[2+29+1+1];
7781 char *cp;
7782 sprintf(buffer, "&#%d;", (int)p[collpos]);
7783 for (cp = buffer; *cp; ++cp) {
7784 x = charmapencode_output(*cp, mapping, res, respos);
7785 if (x==enc_EXCEPTION)
7786 return -1;
7787 else if (x==enc_FAILED) {
7788 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7789 return -1;
7790 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007791 }
7792 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007793 *inpos = collendpos;
7794 break;
7795 default:
7796 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00007797 encoding, reason, p, size, exceptionObject,
7798 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007799 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007800 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007801 if (PyBytes_Check(repunicode)) {
7802 /* Directly copy bytes result to output. */
7803 Py_ssize_t outsize = PyBytes_Size(*res);
7804 Py_ssize_t requiredsize;
7805 repsize = PyBytes_Size(repunicode);
7806 requiredsize = *respos + repsize;
7807 if (requiredsize > outsize)
7808 /* Make room for all additional bytes. */
7809 if (charmapencode_resize(res, respos, requiredsize)) {
7810 Py_DECREF(repunicode);
7811 return -1;
7812 }
7813 memcpy(PyBytes_AsString(*res) + *respos,
7814 PyBytes_AsString(repunicode), repsize);
7815 *respos += repsize;
7816 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007817 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007818 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007819 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007820 /* generate replacement */
7821 repsize = PyUnicode_GET_SIZE(repunicode);
7822 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007823 x = charmapencode_output(*uni2, mapping, res, respos);
7824 if (x==enc_EXCEPTION) {
7825 return -1;
7826 }
7827 else if (x==enc_FAILED) {
7828 Py_DECREF(repunicode);
7829 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7830 return -1;
7831 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007832 }
7833 *inpos = newpos;
7834 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007835 }
7836 return 0;
7837}
7838
Alexander Belopolsky40018472011-02-26 01:02:56 +00007839PyObject *
7840PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7841 Py_ssize_t size,
7842 PyObject *mapping,
7843 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007844{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007845 /* output object */
7846 PyObject *res = NULL;
7847 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007848 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007849 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007850 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007851 PyObject *errorHandler = NULL;
7852 PyObject *exc = NULL;
7853 /* the following variable is used for caching string comparisons
7854 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7855 * 3=ignore, 4=xmlcharrefreplace */
7856 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007857
7858 /* Default to Latin-1 */
7859 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007860 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007861
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007862 /* allocate enough for a simple encoding without
7863 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007864 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007865 if (res == NULL)
7866 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007867 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007868 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007869
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007870 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007871 /* try to encode it */
7872 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7873 if (x==enc_EXCEPTION) /* error */
7874 goto onError;
7875 if (x==enc_FAILED) { /* unencodable character */
7876 if (charmap_encoding_error(p, size, &inpos, mapping,
7877 &exc,
7878 &known_errorHandler, &errorHandler, errors,
7879 &res, &respos)) {
7880 goto onError;
7881 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007882 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007883 else
7884 /* done with this character => adjust input position */
7885 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007886 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007887
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007888 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007889 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007890 if (_PyBytes_Resize(&res, respos) < 0)
7891 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007892
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007893 Py_XDECREF(exc);
7894 Py_XDECREF(errorHandler);
7895 return res;
7896
Benjamin Peterson29060642009-01-31 22:14:21 +00007897 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007898 Py_XDECREF(res);
7899 Py_XDECREF(exc);
7900 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007901 return NULL;
7902}
7903
Alexander Belopolsky40018472011-02-26 01:02:56 +00007904PyObject *
7905PyUnicode_AsCharmapString(PyObject *unicode,
7906 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007907{
7908 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007909 PyErr_BadArgument();
7910 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007911 }
7912 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007913 PyUnicode_GET_SIZE(unicode),
7914 mapping,
7915 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007916}
7917
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007918/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007919static void
7920make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007921 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007922 Py_ssize_t startpos, Py_ssize_t endpos,
7923 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007924{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007925 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007926 *exceptionObject = _PyUnicodeTranslateError_Create(
7927 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007928 }
7929 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007930 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7931 goto onError;
7932 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7933 goto onError;
7934 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7935 goto onError;
7936 return;
7937 onError:
7938 Py_DECREF(*exceptionObject);
7939 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007940 }
7941}
7942
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007943/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007944static void
7945raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007946 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007947 Py_ssize_t startpos, Py_ssize_t endpos,
7948 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007949{
7950 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007951 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007952 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007953 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007954}
7955
7956/* error handling callback helper:
7957 build arguments, call the callback and check the arguments,
7958 put the result into newpos and return the replacement string, which
7959 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007960static PyObject *
7961unicode_translate_call_errorhandler(const char *errors,
7962 PyObject **errorHandler,
7963 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007964 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007965 Py_ssize_t startpos, Py_ssize_t endpos,
7966 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007967{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007968 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007969
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007970 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007971 PyObject *restuple;
7972 PyObject *resunicode;
7973
7974 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007975 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007976 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007977 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007978 }
7979
7980 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007981 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007982 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007983 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007984
7985 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007986 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007987 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007988 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007989 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007990 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007991 Py_DECREF(restuple);
7992 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007993 }
7994 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007995 &resunicode, &i_newpos)) {
7996 Py_DECREF(restuple);
7997 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007998 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007999 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008000 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008001 else
8002 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008003 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008004 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8005 Py_DECREF(restuple);
8006 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008007 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008008 Py_INCREF(resunicode);
8009 Py_DECREF(restuple);
8010 return resunicode;
8011}
8012
8013/* Lookup the character ch in the mapping and put the result in result,
8014 which must be decrefed by the caller.
8015 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008016static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008017charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008018{
Christian Heimes217cfd12007-12-02 14:31:20 +00008019 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008020 PyObject *x;
8021
8022 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008023 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008024 x = PyObject_GetItem(mapping, w);
8025 Py_DECREF(w);
8026 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008027 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8028 /* No mapping found means: use 1:1 mapping. */
8029 PyErr_Clear();
8030 *result = NULL;
8031 return 0;
8032 } else
8033 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008034 }
8035 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008036 *result = x;
8037 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008038 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008039 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008040 long value = PyLong_AS_LONG(x);
8041 long max = PyUnicode_GetMax();
8042 if (value < 0 || value > max) {
8043 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008044 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008045 Py_DECREF(x);
8046 return -1;
8047 }
8048 *result = x;
8049 return 0;
8050 }
8051 else if (PyUnicode_Check(x)) {
8052 *result = x;
8053 return 0;
8054 }
8055 else {
8056 /* wrong return value */
8057 PyErr_SetString(PyExc_TypeError,
8058 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008059 Py_DECREF(x);
8060 return -1;
8061 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008062}
8063/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008064 if not reallocate and adjust various state variables.
8065 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008066static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008067charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008068 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008069{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008070 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00008071 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008072 /* exponentially overallocate to minimize reallocations */
8073 if (requiredsize < 2 * oldsize)
8074 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008075 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8076 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008077 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008078 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008079 }
8080 return 0;
8081}
8082/* lookup the character, put the result in the output string and adjust
8083 various state variables. Return a new reference to the object that
8084 was put in the output buffer in *result, or Py_None, if the mapping was
8085 undefined (in which case no character was written).
8086 The called must decref result.
8087 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008088static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008089charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8090 PyObject *mapping, Py_UCS4 **output,
8091 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008092 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008093{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008094 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8095 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008096 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008097 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008098 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008099 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008100 }
8101 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008102 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008103 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008104 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008105 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008106 }
8107 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008108 Py_ssize_t repsize;
8109 if (PyUnicode_READY(*res) == -1)
8110 return -1;
8111 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008112 if (repsize==1) {
8113 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008114 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008115 }
8116 else if (repsize!=0) {
8117 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008118 Py_ssize_t requiredsize = *opos +
8119 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008120 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008121 Py_ssize_t i;
8122 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008123 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008124 for(i = 0; i < repsize; i++)
8125 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008126 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008127 }
8128 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008129 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008130 return 0;
8131}
8132
Alexander Belopolsky40018472011-02-26 01:02:56 +00008133PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008134_PyUnicode_TranslateCharmap(PyObject *input,
8135 PyObject *mapping,
8136 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008137{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008138 /* input object */
8139 char *idata;
8140 Py_ssize_t size, i;
8141 int kind;
8142 /* output buffer */
8143 Py_UCS4 *output = NULL;
8144 Py_ssize_t osize;
8145 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008146 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008147 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008148 char *reason = "character maps to <undefined>";
8149 PyObject *errorHandler = NULL;
8150 PyObject *exc = NULL;
8151 /* the following variable is used for caching string comparisons
8152 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8153 * 3=ignore, 4=xmlcharrefreplace */
8154 int known_errorHandler = -1;
8155
Guido van Rossumd57fd912000-03-10 22:53:23 +00008156 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008157 PyErr_BadArgument();
8158 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008159 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008160
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008161 if (PyUnicode_READY(input) == -1)
8162 return NULL;
8163 idata = (char*)PyUnicode_DATA(input);
8164 kind = PyUnicode_KIND(input);
8165 size = PyUnicode_GET_LENGTH(input);
8166 i = 0;
8167
8168 if (size == 0) {
8169 Py_INCREF(input);
8170 return input;
8171 }
8172
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008173 /* allocate enough for a simple 1:1 translation without
8174 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008175 osize = size;
8176 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8177 opos = 0;
8178 if (output == NULL) {
8179 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008180 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008181 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008182
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008183 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008184 /* try to encode it */
8185 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008186 if (charmaptranslate_output(input, i, mapping,
8187 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008188 Py_XDECREF(x);
8189 goto onError;
8190 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008191 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008192 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008193 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008194 else { /* untranslatable character */
8195 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8196 Py_ssize_t repsize;
8197 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008198 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008199 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008200 Py_ssize_t collstart = i;
8201 Py_ssize_t collend = i+1;
8202 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008203
Benjamin Peterson29060642009-01-31 22:14:21 +00008204 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008205 while (collend < size) {
8206 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008207 goto onError;
8208 Py_XDECREF(x);
8209 if (x!=Py_None)
8210 break;
8211 ++collend;
8212 }
8213 /* cache callback name lookup
8214 * (if not done yet, i.e. it's the first error) */
8215 if (known_errorHandler==-1) {
8216 if ((errors==NULL) || (!strcmp(errors, "strict")))
8217 known_errorHandler = 1;
8218 else if (!strcmp(errors, "replace"))
8219 known_errorHandler = 2;
8220 else if (!strcmp(errors, "ignore"))
8221 known_errorHandler = 3;
8222 else if (!strcmp(errors, "xmlcharrefreplace"))
8223 known_errorHandler = 4;
8224 else
8225 known_errorHandler = 0;
8226 }
8227 switch (known_errorHandler) {
8228 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008229 raise_translate_exception(&exc, input, collstart,
8230 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008231 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008232 case 2: /* replace */
8233 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008234 for (coll = collstart; coll<collend; coll++)
8235 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008236 /* fall through */
8237 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008238 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008239 break;
8240 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008241 /* generate replacement (temporarily (mis)uses i) */
8242 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008243 char buffer[2+29+1+1];
8244 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008245 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8246 if (charmaptranslate_makespace(&output, &osize,
8247 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008248 goto onError;
8249 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008250 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008251 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008252 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008253 break;
8254 default:
8255 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008256 reason, input, &exc,
8257 collstart, collend, &newpos);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02008258 if (repunicode == NULL || _PyUnicode_READY_REPLACE(&repunicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00008259 goto onError;
8260 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008261 repsize = PyUnicode_GET_LENGTH(repunicode);
8262 if (charmaptranslate_makespace(&output, &osize,
8263 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008264 Py_DECREF(repunicode);
8265 goto onError;
8266 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008267 for (uni2 = 0; repsize-->0; ++uni2)
8268 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8269 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008270 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008271 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008272 }
8273 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008274 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8275 if (!res)
8276 goto onError;
8277 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008278 Py_XDECREF(exc);
8279 Py_XDECREF(errorHandler);
8280 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008281
Benjamin Peterson29060642009-01-31 22:14:21 +00008282 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008283 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008284 Py_XDECREF(exc);
8285 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008286 return NULL;
8287}
8288
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008289/* Deprecated. Use PyUnicode_Translate instead. */
8290PyObject *
8291PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8292 Py_ssize_t size,
8293 PyObject *mapping,
8294 const char *errors)
8295{
8296 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8297 if (!unicode)
8298 return NULL;
8299 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8300}
8301
Alexander Belopolsky40018472011-02-26 01:02:56 +00008302PyObject *
8303PyUnicode_Translate(PyObject *str,
8304 PyObject *mapping,
8305 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008306{
8307 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008308
Guido van Rossumd57fd912000-03-10 22:53:23 +00008309 str = PyUnicode_FromObject(str);
8310 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008311 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008312 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008313 Py_DECREF(str);
8314 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008315
Benjamin Peterson29060642009-01-31 22:14:21 +00008316 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008317 Py_XDECREF(str);
8318 return NULL;
8319}
Tim Petersced69f82003-09-16 20:30:58 +00008320
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008321static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008322fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008323{
8324 /* No need to call PyUnicode_READY(self) because this function is only
8325 called as a callback from fixup() which does it already. */
8326 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8327 const int kind = PyUnicode_KIND(self);
8328 void *data = PyUnicode_DATA(self);
8329 Py_UCS4 maxchar = 0, ch, fixed;
8330 Py_ssize_t i;
8331
8332 for (i = 0; i < len; ++i) {
8333 ch = PyUnicode_READ(kind, data, i);
8334 fixed = 0;
8335 if (ch > 127) {
8336 if (Py_UNICODE_ISSPACE(ch))
8337 fixed = ' ';
8338 else {
8339 const int decimal = Py_UNICODE_TODECIMAL(ch);
8340 if (decimal >= 0)
8341 fixed = '0' + decimal;
8342 }
8343 if (fixed != 0) {
8344 if (fixed > maxchar)
8345 maxchar = fixed;
8346 PyUnicode_WRITE(kind, data, i, fixed);
8347 }
8348 else if (ch > maxchar)
8349 maxchar = ch;
8350 }
8351 else if (ch > maxchar)
8352 maxchar = ch;
8353 }
8354
8355 return maxchar;
8356}
8357
8358PyObject *
8359_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8360{
8361 if (!PyUnicode_Check(unicode)) {
8362 PyErr_BadInternalCall();
8363 return NULL;
8364 }
8365 if (PyUnicode_READY(unicode) == -1)
8366 return NULL;
8367 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8368 /* If the string is already ASCII, just return the same string */
8369 Py_INCREF(unicode);
8370 return unicode;
8371 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008372 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008373}
8374
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008375PyObject *
8376PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8377 Py_ssize_t length)
8378{
8379 PyObject *result;
8380 Py_UNICODE *p; /* write pointer into result */
8381 Py_ssize_t i;
8382 /* Copy to a new string */
8383 result = (PyObject *)_PyUnicode_New(length);
8384 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
8385 if (result == NULL)
8386 return result;
8387 p = PyUnicode_AS_UNICODE(result);
8388 /* Iterate over code points */
8389 for (i = 0; i < length; i++) {
8390 Py_UNICODE ch =s[i];
8391 if (ch > 127) {
8392 int decimal = Py_UNICODE_TODECIMAL(ch);
8393 if (decimal >= 0)
8394 p[i] = '0' + decimal;
8395 }
8396 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008397#ifndef DONT_MAKE_RESULT_READY
8398 if (_PyUnicode_READY_REPLACE(&result)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008399 Py_DECREF(result);
8400 return NULL;
8401 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008402#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02008403 assert(_PyUnicode_CheckConsistency(result, 1));
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008404 return result;
8405}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008406/* --- Decimal Encoder ---------------------------------------------------- */
8407
Alexander Belopolsky40018472011-02-26 01:02:56 +00008408int
8409PyUnicode_EncodeDecimal(Py_UNICODE *s,
8410 Py_ssize_t length,
8411 char *output,
8412 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008413{
8414 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008415 PyObject *errorHandler = NULL;
8416 PyObject *exc = NULL;
8417 const char *encoding = "decimal";
8418 const char *reason = "invalid decimal Unicode string";
8419 /* the following variable is used for caching string comparisons
8420 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
8421 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008422
8423 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008424 PyErr_BadArgument();
8425 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008426 }
8427
8428 p = s;
8429 end = s + length;
8430 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008431 register Py_UNICODE ch = *p;
8432 int decimal;
8433 PyObject *repunicode;
8434 Py_ssize_t repsize;
8435 Py_ssize_t newpos;
8436 Py_UNICODE *uni2;
8437 Py_UNICODE *collstart;
8438 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00008439
Benjamin Peterson29060642009-01-31 22:14:21 +00008440 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008441 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00008442 ++p;
8443 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008444 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008445 decimal = Py_UNICODE_TODECIMAL(ch);
8446 if (decimal >= 0) {
8447 *output++ = '0' + decimal;
8448 ++p;
8449 continue;
8450 }
8451 if (0 < ch && ch < 256) {
8452 *output++ = (char)ch;
8453 ++p;
8454 continue;
8455 }
8456 /* All other characters are considered unencodable */
8457 collstart = p;
8458 collend = p+1;
8459 while (collend < end) {
8460 if ((0 < *collend && *collend < 256) ||
8461 !Py_UNICODE_ISSPACE(*collend) ||
8462 Py_UNICODE_TODECIMAL(*collend))
8463 break;
8464 }
8465 /* cache callback name lookup
8466 * (if not done yet, i.e. it's the first error) */
8467 if (known_errorHandler==-1) {
8468 if ((errors==NULL) || (!strcmp(errors, "strict")))
8469 known_errorHandler = 1;
8470 else if (!strcmp(errors, "replace"))
8471 known_errorHandler = 2;
8472 else if (!strcmp(errors, "ignore"))
8473 known_errorHandler = 3;
8474 else if (!strcmp(errors, "xmlcharrefreplace"))
8475 known_errorHandler = 4;
8476 else
8477 known_errorHandler = 0;
8478 }
8479 switch (known_errorHandler) {
8480 case 1: /* strict */
8481 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
8482 goto onError;
8483 case 2: /* replace */
8484 for (p = collstart; p < collend; ++p)
8485 *output++ = '?';
8486 /* fall through */
8487 case 3: /* ignore */
8488 p = collend;
8489 break;
8490 case 4: /* xmlcharrefreplace */
8491 /* generate replacement (temporarily (mis)uses p) */
8492 for (p = collstart; p < collend; ++p)
8493 output += sprintf(output, "&#%d;", (int)*p);
8494 p = collend;
8495 break;
8496 default:
8497 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
8498 encoding, reason, s, length, &exc,
8499 collstart-s, collend-s, &newpos);
8500 if (repunicode == NULL)
8501 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008502 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00008503 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008504 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8505 Py_DECREF(repunicode);
8506 goto onError;
8507 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008508 /* generate replacement */
8509 repsize = PyUnicode_GET_SIZE(repunicode);
8510 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8511 Py_UNICODE ch = *uni2;
8512 if (Py_UNICODE_ISSPACE(ch))
8513 *output++ = ' ';
8514 else {
8515 decimal = Py_UNICODE_TODECIMAL(ch);
8516 if (decimal >= 0)
8517 *output++ = '0' + decimal;
8518 else if (0 < ch && ch < 256)
8519 *output++ = (char)ch;
8520 else {
8521 Py_DECREF(repunicode);
8522 raise_encode_exception(&exc, encoding,
8523 s, length, collstart-s, collend-s, reason);
8524 goto onError;
8525 }
8526 }
8527 }
8528 p = s + newpos;
8529 Py_DECREF(repunicode);
8530 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00008531 }
8532 /* 0-terminate the output string */
8533 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008534 Py_XDECREF(exc);
8535 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008536 return 0;
8537
Benjamin Peterson29060642009-01-31 22:14:21 +00008538 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008539 Py_XDECREF(exc);
8540 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008541 return -1;
8542}
8543
Guido van Rossumd57fd912000-03-10 22:53:23 +00008544/* --- Helpers ------------------------------------------------------------ */
8545
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008546static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008547any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008548 Py_ssize_t start,
8549 Py_ssize_t end)
8550{
8551 int kind1, kind2, kind;
8552 void *buf1, *buf2;
8553 Py_ssize_t len1, len2, result;
8554
8555 kind1 = PyUnicode_KIND(s1);
8556 kind2 = PyUnicode_KIND(s2);
8557 kind = kind1 > kind2 ? kind1 : kind2;
8558 buf1 = PyUnicode_DATA(s1);
8559 buf2 = PyUnicode_DATA(s2);
8560 if (kind1 != kind)
8561 buf1 = _PyUnicode_AsKind(s1, kind);
8562 if (!buf1)
8563 return -2;
8564 if (kind2 != kind)
8565 buf2 = _PyUnicode_AsKind(s2, kind);
8566 if (!buf2) {
8567 if (kind1 != kind) PyMem_Free(buf1);
8568 return -2;
8569 }
8570 len1 = PyUnicode_GET_LENGTH(s1);
8571 len2 = PyUnicode_GET_LENGTH(s2);
8572
Victor Stinner794d5672011-10-10 03:21:36 +02008573 if (direction > 0) {
8574 switch(kind) {
8575 case PyUnicode_1BYTE_KIND:
8576 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8577 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8578 else
8579 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8580 break;
8581 case PyUnicode_2BYTE_KIND:
8582 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8583 break;
8584 case PyUnicode_4BYTE_KIND:
8585 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8586 break;
8587 default:
8588 assert(0); result = -2;
8589 }
8590 }
8591 else {
8592 switch(kind) {
8593 case PyUnicode_1BYTE_KIND:
8594 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8595 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8596 else
8597 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8598 break;
8599 case PyUnicode_2BYTE_KIND:
8600 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8601 break;
8602 case PyUnicode_4BYTE_KIND:
8603 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8604 break;
8605 default:
8606 assert(0); result = -2;
8607 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008608 }
8609
8610 if (kind1 != kind)
8611 PyMem_Free(buf1);
8612 if (kind2 != kind)
8613 PyMem_Free(buf2);
8614
8615 return result;
8616}
8617
8618Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02008619_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008620 Py_ssize_t n_buffer,
8621 void *digits, Py_ssize_t n_digits,
8622 Py_ssize_t min_width,
8623 const char *grouping,
8624 const char *thousands_sep)
8625{
8626 switch(kind) {
8627 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008628 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
8629 return _PyUnicode_ascii_InsertThousandsGrouping(
8630 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8631 min_width, grouping, thousands_sep);
8632 else
8633 return _PyUnicode_ucs1_InsertThousandsGrouping(
8634 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8635 min_width, grouping, thousands_sep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008636 case PyUnicode_2BYTE_KIND:
8637 return _PyUnicode_ucs2_InsertThousandsGrouping(
8638 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8639 min_width, grouping, thousands_sep);
8640 case PyUnicode_4BYTE_KIND:
8641 return _PyUnicode_ucs4_InsertThousandsGrouping(
8642 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8643 min_width, grouping, thousands_sep);
8644 }
8645 assert(0);
8646 return -1;
8647}
8648
8649
/* Helper macro to normalise start/end slice values against a length:
   `end` is capped at `len`; a negative `end` or `start` is taken relative
   to `len` and then clamped at 0.  A positive `start` is left untouched.
   `start` and `end` must be modifiable lvalues; arguments are evaluated
   more than once. */
#define ADJUST_INDICES(start, end, len)                 \
    if (end > len)                                      \
        end = len;                                      \
    else if (end < 0)                                   \
        end = (end + len < 0) ? 0 : end + len;          \
    if (start < 0)                                      \
        start = (start + len < 0) ? 0 : start + len;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008664
Alexander Belopolsky40018472011-02-26 01:02:56 +00008665Py_ssize_t
8666PyUnicode_Count(PyObject *str,
8667 PyObject *substr,
8668 Py_ssize_t start,
8669 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008670{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008671 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008672 PyUnicodeObject* str_obj;
8673 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008674 int kind1, kind2, kind;
8675 void *buf1 = NULL, *buf2 = NULL;
8676 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008677
Thomas Wouters477c8d52006-05-27 19:21:47 +00008678 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008679 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008680 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008681 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02008682 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008683 Py_DECREF(str_obj);
8684 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008685 }
Tim Petersced69f82003-09-16 20:30:58 +00008686
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008687 kind1 = PyUnicode_KIND(str_obj);
8688 kind2 = PyUnicode_KIND(sub_obj);
8689 kind = kind1 > kind2 ? kind1 : kind2;
8690 buf1 = PyUnicode_DATA(str_obj);
8691 if (kind1 != kind)
8692 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
8693 if (!buf1)
8694 goto onError;
8695 buf2 = PyUnicode_DATA(sub_obj);
8696 if (kind2 != kind)
8697 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
8698 if (!buf2)
8699 goto onError;
8700 len1 = PyUnicode_GET_LENGTH(str_obj);
8701 len2 = PyUnicode_GET_LENGTH(sub_obj);
8702
8703 ADJUST_INDICES(start, end, len1);
8704 switch(kind) {
8705 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008706 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
8707 result = asciilib_count(
8708 ((Py_UCS1*)buf1) + start, end - start,
8709 buf2, len2, PY_SSIZE_T_MAX
8710 );
8711 else
8712 result = ucs1lib_count(
8713 ((Py_UCS1*)buf1) + start, end - start,
8714 buf2, len2, PY_SSIZE_T_MAX
8715 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008716 break;
8717 case PyUnicode_2BYTE_KIND:
8718 result = ucs2lib_count(
8719 ((Py_UCS2*)buf1) + start, end - start,
8720 buf2, len2, PY_SSIZE_T_MAX
8721 );
8722 break;
8723 case PyUnicode_4BYTE_KIND:
8724 result = ucs4lib_count(
8725 ((Py_UCS4*)buf1) + start, end - start,
8726 buf2, len2, PY_SSIZE_T_MAX
8727 );
8728 break;
8729 default:
8730 assert(0); result = 0;
8731 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008732
8733 Py_DECREF(sub_obj);
8734 Py_DECREF(str_obj);
8735
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008736 if (kind1 != kind)
8737 PyMem_Free(buf1);
8738 if (kind2 != kind)
8739 PyMem_Free(buf2);
8740
Guido van Rossumd57fd912000-03-10 22:53:23 +00008741 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008742 onError:
8743 Py_DECREF(sub_obj);
8744 Py_DECREF(str_obj);
8745 if (kind1 != kind && buf1)
8746 PyMem_Free(buf1);
8747 if (kind2 != kind && buf2)
8748 PyMem_Free(buf2);
8749 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008750}
8751
Alexander Belopolsky40018472011-02-26 01:02:56 +00008752Py_ssize_t
8753PyUnicode_Find(PyObject *str,
8754 PyObject *sub,
8755 Py_ssize_t start,
8756 Py_ssize_t end,
8757 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008758{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008759 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008760
Guido van Rossumd57fd912000-03-10 22:53:23 +00008761 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008762 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008763 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008764 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008765 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008766 Py_DECREF(str);
8767 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008768 }
Tim Petersced69f82003-09-16 20:30:58 +00008769
Victor Stinner794d5672011-10-10 03:21:36 +02008770 result = any_find_slice(direction,
8771 str, sub, start, end
8772 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00008773
Guido van Rossumd57fd912000-03-10 22:53:23 +00008774 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008775 Py_DECREF(sub);
8776
Guido van Rossumd57fd912000-03-10 22:53:23 +00008777 return result;
8778}
8779
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008780Py_ssize_t
8781PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8782 Py_ssize_t start, Py_ssize_t end,
8783 int direction)
8784{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008785 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02008786 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008787 if (PyUnicode_READY(str) == -1)
8788 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008789 if (start < 0 || end < 0) {
8790 PyErr_SetString(PyExc_IndexError, "string index out of range");
8791 return -2;
8792 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008793 if (end > PyUnicode_GET_LENGTH(str))
8794 end = PyUnicode_GET_LENGTH(str);
8795 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02008796 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
8797 kind, end-start, ch, direction);
8798 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008799 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02008800 else
8801 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008802}
8803
Alexander Belopolsky40018472011-02-26 01:02:56 +00008804static int
8805tailmatch(PyUnicodeObject *self,
8806 PyUnicodeObject *substring,
8807 Py_ssize_t start,
8808 Py_ssize_t end,
8809 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008810{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008811 int kind_self;
8812 int kind_sub;
8813 void *data_self;
8814 void *data_sub;
8815 Py_ssize_t offset;
8816 Py_ssize_t i;
8817 Py_ssize_t end_sub;
8818
8819 if (PyUnicode_READY(self) == -1 ||
8820 PyUnicode_READY(substring) == -1)
8821 return 0;
8822
8823 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008824 return 1;
8825
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008826 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8827 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008828 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008829 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008830
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008831 kind_self = PyUnicode_KIND(self);
8832 data_self = PyUnicode_DATA(self);
8833 kind_sub = PyUnicode_KIND(substring);
8834 data_sub = PyUnicode_DATA(substring);
8835 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8836
8837 if (direction > 0)
8838 offset = end;
8839 else
8840 offset = start;
8841
8842 if (PyUnicode_READ(kind_self, data_self, offset) ==
8843 PyUnicode_READ(kind_sub, data_sub, 0) &&
8844 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8845 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8846 /* If both are of the same kind, memcmp is sufficient */
8847 if (kind_self == kind_sub) {
8848 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02008849 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008850 data_sub,
8851 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02008852 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008853 }
8854 /* otherwise we have to compare each character by first accesing it */
8855 else {
8856 /* We do not need to compare 0 and len(substring)-1 because
8857 the if statement above ensured already that they are equal
8858 when we end up here. */
8859 // TODO: honor direction and do a forward or backwards search
8860 for (i = 1; i < end_sub; ++i) {
8861 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8862 PyUnicode_READ(kind_sub, data_sub, i))
8863 return 0;
8864 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008865 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008866 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008867 }
8868
8869 return 0;
8870}
8871
Alexander Belopolsky40018472011-02-26 01:02:56 +00008872Py_ssize_t
8873PyUnicode_Tailmatch(PyObject *str,
8874 PyObject *substr,
8875 Py_ssize_t start,
8876 Py_ssize_t end,
8877 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008878{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008879 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008880
Guido van Rossumd57fd912000-03-10 22:53:23 +00008881 str = PyUnicode_FromObject(str);
8882 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008883 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008884 substr = PyUnicode_FromObject(substr);
8885 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008886 Py_DECREF(str);
8887 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008888 }
Tim Petersced69f82003-09-16 20:30:58 +00008889
Guido van Rossumd57fd912000-03-10 22:53:23 +00008890 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008891 (PyUnicodeObject *)substr,
8892 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008893 Py_DECREF(str);
8894 Py_DECREF(substr);
8895 return result;
8896}
8897
Guido van Rossumd57fd912000-03-10 22:53:23 +00008898/* Apply fixfct filter to the Unicode object self and return a
8899 reference to the modified object */
8900
Alexander Belopolsky40018472011-02-26 01:02:56 +00008901static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02008902fixup(PyObject *self,
8903 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008904{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008905 PyObject *u;
8906 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008907
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008908 if (PyUnicode_READY(self) == -1)
8909 return NULL;
8910 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8911 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8912 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008913 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008914 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008915
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008916 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02008917 PyUnicode_GET_LENGTH(u) * PyUnicode_KIND(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008918
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008919 /* fix functions return the new maximum character in a string,
8920 if the kind of the resulting unicode object does not change,
8921 everything is fine. Otherwise we need to change the string kind
8922 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02008923 maxchar_new = fixfct(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008924 if (maxchar_new == 0)
8925 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8926 else if (maxchar_new <= 127)
8927 maxchar_new = 127;
8928 else if (maxchar_new <= 255)
8929 maxchar_new = 255;
8930 else if (maxchar_new <= 65535)
8931 maxchar_new = 65535;
8932 else
8933 maxchar_new = 1114111; /* 0x10ffff */
8934
8935 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008936 /* fixfct should return TRUE if it modified the buffer. If
8937 FALSE, return a reference to the original buffer instead
8938 (to save space, not time) */
8939 Py_INCREF(self);
8940 Py_DECREF(u);
8941 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008942 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008943 else if (maxchar_new == maxchar_old) {
8944 return u;
8945 }
8946 else {
8947 /* In case the maximum character changed, we need to
8948 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008949 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008950 if (v == NULL) {
8951 Py_DECREF(u);
8952 return NULL;
8953 }
8954 if (maxchar_new > maxchar_old) {
8955 /* If the maxchar increased so that the kind changed, not all
8956 characters are representable anymore and we need to fix the
8957 string again. This only happens in very few cases. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02008958 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner9310abb2011-10-05 00:59:23 +02008959 maxchar_old = fixfct(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008960 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8961 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008962 else {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02008963 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008964 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008965
8966 Py_DECREF(u);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02008967 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008968 return v;
8969 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008970}
8971
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008972static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008973fixupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008974{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008975 /* No need to call PyUnicode_READY(self) because this function is only
8976 called as a callback from fixup() which does it already. */
8977 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8978 const int kind = PyUnicode_KIND(self);
8979 void *data = PyUnicode_DATA(self);
8980 int touched = 0;
8981 Py_UCS4 maxchar = 0;
8982 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008983
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008984 for (i = 0; i < len; ++i) {
8985 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8986 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8987 if (up != ch) {
8988 if (up > maxchar)
8989 maxchar = up;
8990 PyUnicode_WRITE(kind, data, i, up);
8991 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008992 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008993 else if (ch > maxchar)
8994 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008995 }
8996
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008997 if (touched)
8998 return maxchar;
8999 else
9000 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009001}
9002
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009003static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009004fixlower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009005{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009006 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9007 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9008 const int kind = PyUnicode_KIND(self);
9009 void *data = PyUnicode_DATA(self);
9010 int touched = 0;
9011 Py_UCS4 maxchar = 0;
9012 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009013
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009014 for(i = 0; i < len; ++i) {
9015 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9016 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9017 if (lo != ch) {
9018 if (lo > maxchar)
9019 maxchar = lo;
9020 PyUnicode_WRITE(kind, data, i, lo);
9021 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009022 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009023 else if (ch > maxchar)
9024 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009025 }
9026
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009027 if (touched)
9028 return maxchar;
9029 else
9030 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009031}
9032
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009033static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009034fixswapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009035{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009036 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9037 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9038 const int kind = PyUnicode_KIND(self);
9039 void *data = PyUnicode_DATA(self);
9040 int touched = 0;
9041 Py_UCS4 maxchar = 0;
9042 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009043
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009044 for(i = 0; i < len; ++i) {
9045 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9046 Py_UCS4 nu = 0;
9047
9048 if (Py_UNICODE_ISUPPER(ch))
9049 nu = Py_UNICODE_TOLOWER(ch);
9050 else if (Py_UNICODE_ISLOWER(ch))
9051 nu = Py_UNICODE_TOUPPER(ch);
9052
9053 if (nu != 0) {
9054 if (nu > maxchar)
9055 maxchar = nu;
9056 PyUnicode_WRITE(kind, data, i, nu);
9057 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009058 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009059 else if (ch > maxchar)
9060 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009061 }
9062
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009063 if (touched)
9064 return maxchar;
9065 else
9066 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009067}
9068
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009069static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009070fixcapitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009071{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009072 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9073 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9074 const int kind = PyUnicode_KIND(self);
9075 void *data = PyUnicode_DATA(self);
9076 int touched = 0;
9077 Py_UCS4 maxchar = 0;
9078 Py_ssize_t i = 0;
9079 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00009080
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009081 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009082 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009083
9084 ch = PyUnicode_READ(kind, data, i);
9085 if (!Py_UNICODE_ISUPPER(ch)) {
9086 maxchar = Py_UNICODE_TOUPPER(ch);
9087 PyUnicode_WRITE(kind, data, i, maxchar);
9088 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009089 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009090 ++i;
9091 for(; i < len; ++i) {
9092 ch = PyUnicode_READ(kind, data, i);
9093 if (!Py_UNICODE_ISLOWER(ch)) {
9094 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9095 if (lo > maxchar)
9096 maxchar = lo;
9097 PyUnicode_WRITE(kind, data, i, lo);
9098 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009099 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009100 else if (ch > maxchar)
9101 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009102 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009103
9104 if (touched)
9105 return maxchar;
9106 else
9107 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009108}
9109
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009110static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009111fixtitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009112{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009113 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9114 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9115 const int kind = PyUnicode_KIND(self);
9116 void *data = PyUnicode_DATA(self);
9117 Py_UCS4 maxchar = 0;
9118 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009119 int previous_is_cased;
9120
9121 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009122 if (len == 1) {
9123 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9124 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
9125 if (ti != ch) {
9126 PyUnicode_WRITE(kind, data, i, ti);
9127 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00009128 }
9129 else
9130 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009131 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009132 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009133 for(; i < len; ++i) {
9134 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9135 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00009136
Benjamin Peterson29060642009-01-31 22:14:21 +00009137 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009138 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00009139 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009140 nu = Py_UNICODE_TOTITLE(ch);
9141
9142 if (nu > maxchar)
9143 maxchar = nu;
9144 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00009145
Benjamin Peterson29060642009-01-31 22:14:21 +00009146 if (Py_UNICODE_ISLOWER(ch) ||
9147 Py_UNICODE_ISUPPER(ch) ||
9148 Py_UNICODE_ISTITLE(ch))
9149 previous_is_cased = 1;
9150 else
9151 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009152 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009153 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009154}
9155
Tim Peters8ce9f162004-08-27 01:49:32 +00009156PyObject *
9157PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009158{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009159 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009160 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009161 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009162 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009163 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9164 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009165 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009166 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009167 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009168 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009169 int use_memcpy;
9170 unsigned char *res_data = NULL, *sep_data = NULL;
9171 PyObject *last_obj;
9172 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009173
Tim Peters05eba1f2004-08-27 21:32:02 +00009174 fseq = PySequence_Fast(seq, "");
9175 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009176 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009177 }
9178
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009179 /* NOTE: the following code can't call back into Python code,
9180 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009181 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009182
Tim Peters05eba1f2004-08-27 21:32:02 +00009183 seqlen = PySequence_Fast_GET_SIZE(fseq);
9184 /* If empty sequence, return u"". */
9185 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009186 Py_DECREF(fseq);
9187 Py_INCREF(unicode_empty);
9188 res = unicode_empty;
9189 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009190 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009191
Tim Peters05eba1f2004-08-27 21:32:02 +00009192 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009193 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009194 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009195 if (seqlen == 1) {
9196 if (PyUnicode_CheckExact(items[0])) {
9197 res = items[0];
9198 Py_INCREF(res);
9199 Py_DECREF(fseq);
9200 return res;
9201 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009202 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009203 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009204 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009205 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009206 /* Set up sep and seplen */
9207 if (separator == NULL) {
9208 /* fall back to a blank space separator */
9209 sep = PyUnicode_FromOrdinal(' ');
9210 if (!sep)
9211 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009212 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009213 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009214 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009215 else {
9216 if (!PyUnicode_Check(separator)) {
9217 PyErr_Format(PyExc_TypeError,
9218 "separator: expected str instance,"
9219 " %.80s found",
9220 Py_TYPE(separator)->tp_name);
9221 goto onError;
9222 }
9223 if (PyUnicode_READY(separator))
9224 goto onError;
9225 sep = separator;
9226 seplen = PyUnicode_GET_LENGTH(separator);
9227 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9228 /* inc refcount to keep this code path symmetric with the
9229 above case of a blank separator */
9230 Py_INCREF(sep);
9231 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009232 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009233 }
9234
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009235 /* There are at least two things to join, or else we have a subclass
9236 * of str in the sequence.
9237 * Do a pre-pass to figure out the total amount of space we'll
9238 * need (sz), and see whether all argument are strings.
9239 */
9240 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009241#ifdef Py_DEBUG
9242 use_memcpy = 0;
9243#else
9244 use_memcpy = 1;
9245#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009246 for (i = 0; i < seqlen; i++) {
9247 const Py_ssize_t old_sz = sz;
9248 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009249 if (!PyUnicode_Check(item)) {
9250 PyErr_Format(PyExc_TypeError,
9251 "sequence item %zd: expected str instance,"
9252 " %.80s found",
9253 i, Py_TYPE(item)->tp_name);
9254 goto onError;
9255 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009256 if (PyUnicode_READY(item) == -1)
9257 goto onError;
9258 sz += PyUnicode_GET_LENGTH(item);
9259 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009260 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009261 if (i != 0)
9262 sz += seplen;
9263 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9264 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009265 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009266 goto onError;
9267 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009268 if (use_memcpy && last_obj != NULL) {
9269 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9270 use_memcpy = 0;
9271 }
9272 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009273 }
Tim Petersced69f82003-09-16 20:30:58 +00009274
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009275 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009276 if (res == NULL)
9277 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009278
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009279 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009280#ifdef Py_DEBUG
9281 use_memcpy = 0;
9282#else
9283 if (use_memcpy) {
9284 res_data = PyUnicode_1BYTE_DATA(res);
9285 kind = PyUnicode_KIND(res);
9286 if (seplen != 0)
9287 sep_data = PyUnicode_1BYTE_DATA(sep);
9288 }
9289#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009290 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009291 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009292 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009293 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009294 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009295 if (use_memcpy) {
9296 Py_MEMCPY(res_data,
9297 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009298 kind * seplen);
9299 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009300 }
9301 else {
9302 copy_characters(res, res_offset, sep, 0, seplen);
9303 res_offset += seplen;
9304 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009305 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009306 itemlen = PyUnicode_GET_LENGTH(item);
9307 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009308 if (use_memcpy) {
9309 Py_MEMCPY(res_data,
9310 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009311 kind * itemlen);
9312 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009313 }
9314 else {
9315 copy_characters(res, res_offset, item, 0, itemlen);
9316 res_offset += itemlen;
9317 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009318 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009319 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009320 if (use_memcpy)
9321 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009322 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009323 else
9324 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009325
Tim Peters05eba1f2004-08-27 21:32:02 +00009326 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009327 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009328 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009329 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009330
Benjamin Peterson29060642009-01-31 22:14:21 +00009331 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009332 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009333 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009334 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009335 return NULL;
9336}
9337
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009338#define FILL(kind, data, value, start, length) \
9339 do { \
9340 Py_ssize_t i_ = 0; \
9341 assert(kind != PyUnicode_WCHAR_KIND); \
9342 switch ((kind)) { \
9343 case PyUnicode_1BYTE_KIND: { \
9344 unsigned char * to_ = (unsigned char *)((data)) + (start); \
9345 memset(to_, (unsigned char)value, length); \
9346 break; \
9347 } \
9348 case PyUnicode_2BYTE_KIND: { \
9349 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
9350 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9351 break; \
9352 } \
9353 default: { \
9354 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
9355 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9356 break; \
9357 } \
9358 } \
9359 } while (0)
9360
Victor Stinner9310abb2011-10-05 00:59:23 +02009361static PyObject *
9362pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009363 Py_ssize_t left,
9364 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009365 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009366{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009367 PyObject *u;
9368 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009369 int kind;
9370 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009371
9372 if (left < 0)
9373 left = 0;
9374 if (right < 0)
9375 right = 0;
9376
Tim Peters7a29bd52001-09-12 03:03:31 +00009377 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009378 Py_INCREF(self);
9379 return self;
9380 }
9381
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009382 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9383 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009384 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9385 return NULL;
9386 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009387 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9388 if (fill > maxchar)
9389 maxchar = fill;
9390 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009391 if (!u)
9392 return NULL;
9393
9394 kind = PyUnicode_KIND(u);
9395 data = PyUnicode_DATA(u);
9396 if (left)
9397 FILL(kind, data, fill, 0, left);
9398 if (right)
9399 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009400 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009401 assert(_PyUnicode_CheckConsistency(u, 1));
9402 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009403}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009404#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009405
Alexander Belopolsky40018472011-02-26 01:02:56 +00009406PyObject *
9407PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009408{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009409 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009410
9411 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009412 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009413 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009414
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009415 switch(PyUnicode_KIND(string)) {
9416 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009417 if (PyUnicode_IS_ASCII(string))
9418 list = asciilib_splitlines(
9419 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9420 PyUnicode_GET_LENGTH(string), keepends);
9421 else
9422 list = ucs1lib_splitlines(
9423 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9424 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009425 break;
9426 case PyUnicode_2BYTE_KIND:
9427 list = ucs2lib_splitlines(
9428 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
9429 PyUnicode_GET_LENGTH(string), keepends);
9430 break;
9431 case PyUnicode_4BYTE_KIND:
9432 list = ucs4lib_splitlines(
9433 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
9434 PyUnicode_GET_LENGTH(string), keepends);
9435 break;
9436 default:
9437 assert(0);
9438 list = 0;
9439 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009440 Py_DECREF(string);
9441 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009442}
9443
Alexander Belopolsky40018472011-02-26 01:02:56 +00009444static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009445split(PyObject *self,
9446 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009447 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009448{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009449 int kind1, kind2, kind;
9450 void *buf1, *buf2;
9451 Py_ssize_t len1, len2;
9452 PyObject* out;
9453
Guido van Rossumd57fd912000-03-10 22:53:23 +00009454 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009455 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009456
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009457 if (PyUnicode_READY(self) == -1)
9458 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009459
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009460 if (substring == NULL)
9461 switch(PyUnicode_KIND(self)) {
9462 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009463 if (PyUnicode_IS_ASCII(self))
9464 return asciilib_split_whitespace(
9465 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9466 PyUnicode_GET_LENGTH(self), maxcount
9467 );
9468 else
9469 return ucs1lib_split_whitespace(
9470 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9471 PyUnicode_GET_LENGTH(self), maxcount
9472 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009473 case PyUnicode_2BYTE_KIND:
9474 return ucs2lib_split_whitespace(
9475 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9476 PyUnicode_GET_LENGTH(self), maxcount
9477 );
9478 case PyUnicode_4BYTE_KIND:
9479 return ucs4lib_split_whitespace(
9480 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9481 PyUnicode_GET_LENGTH(self), maxcount
9482 );
9483 default:
9484 assert(0);
9485 return NULL;
9486 }
9487
9488 if (PyUnicode_READY(substring) == -1)
9489 return NULL;
9490
9491 kind1 = PyUnicode_KIND(self);
9492 kind2 = PyUnicode_KIND(substring);
9493 kind = kind1 > kind2 ? kind1 : kind2;
9494 buf1 = PyUnicode_DATA(self);
9495 buf2 = PyUnicode_DATA(substring);
9496 if (kind1 != kind)
9497 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9498 if (!buf1)
9499 return NULL;
9500 if (kind2 != kind)
9501 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9502 if (!buf2) {
9503 if (kind1 != kind) PyMem_Free(buf1);
9504 return NULL;
9505 }
9506 len1 = PyUnicode_GET_LENGTH(self);
9507 len2 = PyUnicode_GET_LENGTH(substring);
9508
9509 switch(kind) {
9510 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009511 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9512 out = asciilib_split(
9513 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9514 else
9515 out = ucs1lib_split(
9516 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009517 break;
9518 case PyUnicode_2BYTE_KIND:
9519 out = ucs2lib_split(
9520 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9521 break;
9522 case PyUnicode_4BYTE_KIND:
9523 out = ucs4lib_split(
9524 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9525 break;
9526 default:
9527 out = NULL;
9528 }
9529 if (kind1 != kind)
9530 PyMem_Free(buf1);
9531 if (kind2 != kind)
9532 PyMem_Free(buf2);
9533 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009534}
9535
Alexander Belopolsky40018472011-02-26 01:02:56 +00009536static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009537rsplit(PyObject *self,
9538 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009539 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009540{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009541 int kind1, kind2, kind;
9542 void *buf1, *buf2;
9543 Py_ssize_t len1, len2;
9544 PyObject* out;
9545
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009546 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009547 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009548
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009549 if (PyUnicode_READY(self) == -1)
9550 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009551
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009552 if (substring == NULL)
9553 switch(PyUnicode_KIND(self)) {
9554 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009555 if (PyUnicode_IS_ASCII(self))
9556 return asciilib_rsplit_whitespace(
9557 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9558 PyUnicode_GET_LENGTH(self), maxcount
9559 );
9560 else
9561 return ucs1lib_rsplit_whitespace(
9562 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9563 PyUnicode_GET_LENGTH(self), maxcount
9564 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009565 case PyUnicode_2BYTE_KIND:
9566 return ucs2lib_rsplit_whitespace(
9567 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9568 PyUnicode_GET_LENGTH(self), maxcount
9569 );
9570 case PyUnicode_4BYTE_KIND:
9571 return ucs4lib_rsplit_whitespace(
9572 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9573 PyUnicode_GET_LENGTH(self), maxcount
9574 );
9575 default:
9576 assert(0);
9577 return NULL;
9578 }
9579
9580 if (PyUnicode_READY(substring) == -1)
9581 return NULL;
9582
9583 kind1 = PyUnicode_KIND(self);
9584 kind2 = PyUnicode_KIND(substring);
9585 kind = kind1 > kind2 ? kind1 : kind2;
9586 buf1 = PyUnicode_DATA(self);
9587 buf2 = PyUnicode_DATA(substring);
9588 if (kind1 != kind)
9589 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9590 if (!buf1)
9591 return NULL;
9592 if (kind2 != kind)
9593 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9594 if (!buf2) {
9595 if (kind1 != kind) PyMem_Free(buf1);
9596 return NULL;
9597 }
9598 len1 = PyUnicode_GET_LENGTH(self);
9599 len2 = PyUnicode_GET_LENGTH(substring);
9600
9601 switch(kind) {
9602 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009603 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9604 out = asciilib_rsplit(
9605 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9606 else
9607 out = ucs1lib_rsplit(
9608 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009609 break;
9610 case PyUnicode_2BYTE_KIND:
9611 out = ucs2lib_rsplit(
9612 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9613 break;
9614 case PyUnicode_4BYTE_KIND:
9615 out = ucs4lib_rsplit(
9616 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9617 break;
9618 default:
9619 out = NULL;
9620 }
9621 if (kind1 != kind)
9622 PyMem_Free(buf1);
9623 if (kind2 != kind)
9624 PyMem_Free(buf2);
9625 return out;
9626}
9627
9628static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009629anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9630 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009631{
9632 switch(kind) {
9633 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009634 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9635 return asciilib_find(buf1, len1, buf2, len2, offset);
9636 else
9637 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009638 case PyUnicode_2BYTE_KIND:
9639 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9640 case PyUnicode_4BYTE_KIND:
9641 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9642 }
9643 assert(0);
9644 return -1;
9645}
9646
9647static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009648anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
9649 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009650{
9651 switch(kind) {
9652 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009653 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
9654 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
9655 else
9656 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009657 case PyUnicode_2BYTE_KIND:
9658 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9659 case PyUnicode_4BYTE_KIND:
9660 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9661 }
9662 assert(0);
9663 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009664}
9665
Alexander Belopolsky40018472011-02-26 01:02:56 +00009666static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009667replace(PyObject *self, PyObject *str1,
9668 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009669{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009670 PyObject *u;
9671 char *sbuf = PyUnicode_DATA(self);
9672 char *buf1 = PyUnicode_DATA(str1);
9673 char *buf2 = PyUnicode_DATA(str2);
9674 int srelease = 0, release1 = 0, release2 = 0;
9675 int skind = PyUnicode_KIND(self);
9676 int kind1 = PyUnicode_KIND(str1);
9677 int kind2 = PyUnicode_KIND(str2);
9678 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9679 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9680 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +02009681 int mayshrink;
9682 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009683
9684 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009685 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009686 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009687 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009688
Victor Stinner59de0ee2011-10-07 10:01:28 +02009689 if (str1 == str2)
9690 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009691 if (skind < kind1)
9692 /* substring too wide to be present */
9693 goto nothing;
9694
Victor Stinner49a0a212011-10-12 23:46:10 +02009695 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9696 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
9697 /* Replacing str1 with str2 may cause a maxchar reduction in the
9698 result string. */
9699 mayshrink = (maxchar_str2 < maxchar);
9700 maxchar = Py_MAX(maxchar, maxchar_str2);
9701
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009702 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00009703 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009704 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009705 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009706 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009707 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009708 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +02009709 Py_UCS4 u1, u2;
9710 int rkind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009711 u1 = PyUnicode_READ_CHAR(str1, 0);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009712 if (findchar(sbuf, PyUnicode_KIND(self),
9713 slen, u1, 1) < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009714 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009715 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009716 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009717 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009718 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009719 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009720 rkind = PyUnicode_KIND(u);
9721 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
9722 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009723 if (--maxcount < 0)
9724 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009725 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009726 }
Victor Stinner49a0a212011-10-12 23:46:10 +02009727 }
9728 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009729 int rkind = skind;
9730 char *res;
Victor Stinner25a4b292011-10-06 12:31:55 +02009731
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009732 if (kind1 < rkind) {
9733 /* widen substring */
9734 buf1 = _PyUnicode_AsKind(str1, rkind);
9735 if (!buf1) goto error;
9736 release1 = 1;
9737 }
Victor Stinnerc3cec782011-10-05 21:24:08 +02009738 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009739 if (i < 0)
9740 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009741 if (rkind > kind2) {
9742 /* widen replacement */
9743 buf2 = _PyUnicode_AsKind(str2, rkind);
9744 if (!buf2) goto error;
9745 release2 = 1;
9746 }
9747 else if (rkind < kind2) {
9748 /* widen self and buf1 */
9749 rkind = kind2;
9750 if (release1) PyMem_Free(buf1);
9751 sbuf = _PyUnicode_AsKind(self, rkind);
9752 if (!sbuf) goto error;
9753 srelease = 1;
9754 buf1 = _PyUnicode_AsKind(str1, rkind);
9755 if (!buf1) goto error;
9756 release1 = 1;
9757 }
Victor Stinner49a0a212011-10-12 23:46:10 +02009758 u = PyUnicode_New(slen, maxchar);
9759 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009760 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +02009761 assert(PyUnicode_KIND(u) == rkind);
9762 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +02009763
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009764 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009765 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009766 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009767 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009768 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009769 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009770
9771 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +02009772 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009773 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009774 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009775 if (i == -1)
9776 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009777 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009778 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009779 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009780 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009781 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009782 }
Victor Stinner49a0a212011-10-12 23:46:10 +02009783 }
9784 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009785 Py_ssize_t n, i, j, ires;
9786 Py_ssize_t product, new_size;
9787 int rkind = skind;
9788 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009789
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009790 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +02009791 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009792 buf1 = _PyUnicode_AsKind(str1, rkind);
9793 if (!buf1) goto error;
9794 release1 = 1;
9795 }
Victor Stinnerc3cec782011-10-05 21:24:08 +02009796 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009797 if (n == 0)
9798 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009799 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +02009800 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009801 buf2 = _PyUnicode_AsKind(str2, rkind);
9802 if (!buf2) goto error;
9803 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009804 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009805 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +02009806 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009807 rkind = kind2;
9808 sbuf = _PyUnicode_AsKind(self, rkind);
9809 if (!sbuf) goto error;
9810 srelease = 1;
9811 if (release1) PyMem_Free(buf1);
9812 buf1 = _PyUnicode_AsKind(str1, rkind);
9813 if (!buf1) goto error;
9814 release1 = 1;
9815 }
9816 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
9817 PyUnicode_GET_LENGTH(str1))); */
9818 product = n * (len2-len1);
9819 if ((product / (len2-len1)) != n) {
9820 PyErr_SetString(PyExc_OverflowError,
9821 "replace string is too long");
9822 goto error;
9823 }
9824 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +02009825 if (new_size == 0) {
9826 Py_INCREF(unicode_empty);
9827 u = unicode_empty;
9828 goto done;
9829 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009830 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
9831 PyErr_SetString(PyExc_OverflowError,
9832 "replace string is too long");
9833 goto error;
9834 }
Victor Stinner49a0a212011-10-12 23:46:10 +02009835 u = PyUnicode_New(new_size, maxchar);
9836 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009837 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +02009838 assert(PyUnicode_KIND(u) == rkind);
9839 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009840 ires = i = 0;
9841 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009842 while (n-- > 0) {
9843 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +02009844 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009845 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009846 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009847 if (j == -1)
9848 break;
9849 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009850 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009851 memcpy(res + rkind * ires,
9852 sbuf + rkind * i,
9853 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009854 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009855 }
9856 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009857 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009858 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009859 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009860 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009861 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009862 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009863 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009864 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009865 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009866 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009867 memcpy(res + rkind * ires,
9868 sbuf + rkind * i,
9869 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +02009870 }
9871 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009872 /* interleave */
9873 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009874 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009875 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009876 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009877 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009878 if (--n <= 0)
9879 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009880 memcpy(res + rkind * ires,
9881 sbuf + rkind * i,
9882 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009883 ires++;
9884 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009885 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009886 memcpy(res + rkind * ires,
9887 sbuf + rkind * i,
9888 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009889 }
Victor Stinner49a0a212011-10-12 23:46:10 +02009890 }
9891
9892 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +02009893 unicode_adjust_maxchar(&u);
9894 if (u == NULL)
9895 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009896 }
Victor Stinner49a0a212011-10-12 23:46:10 +02009897
9898 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009899 if (srelease)
9900 PyMem_FREE(sbuf);
9901 if (release1)
9902 PyMem_FREE(buf1);
9903 if (release2)
9904 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009905 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009906 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009907
Benjamin Peterson29060642009-01-31 22:14:21 +00009908 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009909 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009910 if (srelease)
9911 PyMem_FREE(sbuf);
9912 if (release1)
9913 PyMem_FREE(buf1);
9914 if (release2)
9915 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009916 if (PyUnicode_CheckExact(self)) {
9917 Py_INCREF(self);
9918 return (PyObject *) self;
9919 }
Victor Stinner034f6cf2011-09-30 02:26:44 +02009920 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009921 error:
9922 if (srelease && sbuf)
9923 PyMem_FREE(sbuf);
9924 if (release1 && buf1)
9925 PyMem_FREE(buf1);
9926 if (release2 && buf2)
9927 PyMem_FREE(buf2);
9928 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009929}
9930
9931/* --- Unicode Object Methods --------------------------------------------- */
9932
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009933PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009934 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009935\n\
9936Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009937characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009938
9939static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +02009940unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009941{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009942 return fixup(self, fixtitle);
9943}
9944
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009945PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009946 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009947\n\
9948Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009949have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009950
9951static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +02009952unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009953{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009954 return fixup(self, fixcapitalize);
9955}
9956
#if 0
/* NOTE: this whole section is compiled out. */
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split self with split(self, NULL, -1), replace every list item by its
   fixup(..., fixcapitalize) result, then join the list with
   PyUnicode_Join(NULL, list).  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *word;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (!words)
        return NULL;

    /* Capitalize each word */
    idx = 0;
    while (idx < PyList_GET_SIZE(words)) {
        word = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                     fixcapitalize);
        if (word == NULL)
            goto onError;
        /* drop the old item before PyList_SET_ITEM overwrites its slot */
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, word);
        idx++;
    }

    /* Join the words to form a new string */
    word = PyUnicode_Join(NULL, words);

  onError:
    /* on the failure path `word` is NULL, which is what gets returned */
    Py_DECREF(words);
    return (PyObject *)word;
}
#endif
9994
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009995/* Argument converter. Coerces to a single unicode character */
9996
9997static int
9998convert_uc(PyObject *obj, void *addr)
9999{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010000 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010001 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010002
Benjamin Peterson14339b62009-01-31 16:36:08 +000010003 uniobj = PyUnicode_FromObject(obj);
10004 if (uniobj == NULL) {
10005 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010006 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010007 return 0;
10008 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010009 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010010 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010011 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010012 Py_DECREF(uniobj);
10013 return 0;
10014 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010015 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010016 Py_DECREF(uniobj);
10017 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010018}
10019
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010020PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010021 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010022\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010023Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010024done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010025
10026static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010027unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010028{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010029 Py_ssize_t marg, left;
10030 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010031 Py_UCS4 fillchar = ' ';
10032
Victor Stinnere9a29352011-10-01 02:14:59 +020010033 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010034 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010035
Victor Stinnere9a29352011-10-01 02:14:59 +020010036 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010037 return NULL;
10038
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010039 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010040 Py_INCREF(self);
10041 return (PyObject*) self;
10042 }
10043
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010044 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010045 left = marg / 2 + (marg & width & 1);
10046
Victor Stinner9310abb2011-10-05 00:59:23 +020010047 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010048}
10049
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010050/* This function assumes that str1 and str2 are readied by the caller. */
10051
Marc-André Lemburge5034372000-08-08 08:04:29 +000010052static int
10053unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
10054{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010055 int kind1, kind2;
10056 void *data1, *data2;
10057 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010058
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010059 kind1 = PyUnicode_KIND(str1);
10060 kind2 = PyUnicode_KIND(str2);
10061 data1 = PyUnicode_DATA(str1);
10062 data2 = PyUnicode_DATA(str2);
10063 len1 = PyUnicode_GET_LENGTH(str1);
10064 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010065
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010066 for (i = 0; i < len1 && i < len2; ++i) {
10067 Py_UCS4 c1, c2;
10068 c1 = PyUnicode_READ(kind1, data1, i);
10069 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010070
10071 if (c1 != c2)
10072 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010073 }
10074
10075 return (len1 < len2) ? -1 : (len1 != len2);
10076}
10077
Alexander Belopolsky40018472011-02-26 01:02:56 +000010078int
10079PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010080{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010081 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10082 if (PyUnicode_READY(left) == -1 ||
10083 PyUnicode_READY(right) == -1)
10084 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010085 return unicode_compare((PyUnicodeObject *)left,
10086 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010087 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010088 PyErr_Format(PyExc_TypeError,
10089 "Can't compare %.100s and %.100s",
10090 left->ob_type->tp_name,
10091 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010092 return -1;
10093}
10094
Martin v. Löwis5b222132007-06-10 09:51:05 +000010095int
10096PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10097{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010098 Py_ssize_t i;
10099 int kind;
10100 void *data;
10101 Py_UCS4 chr;
10102
Victor Stinner910337b2011-10-03 03:20:16 +020010103 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010104 if (PyUnicode_READY(uni) == -1)
10105 return -1;
10106 kind = PyUnicode_KIND(uni);
10107 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010108 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010109 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10110 if (chr != str[i])
10111 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010112 /* This check keeps Python strings that end in '\0' from comparing equal
10113 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010114 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010115 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010116 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010117 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010118 return 0;
10119}
10120
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010121
Benjamin Peterson29060642009-01-31 22:14:21 +000010122#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010123 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010124
Alexander Belopolsky40018472011-02-26 01:02:56 +000010125PyObject *
10126PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010127{
10128 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010129
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010130 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10131 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010132 if (PyUnicode_READY(left) == -1 ||
10133 PyUnicode_READY(right) == -1)
10134 return NULL;
10135 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10136 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010137 if (op == Py_EQ) {
10138 Py_INCREF(Py_False);
10139 return Py_False;
10140 }
10141 if (op == Py_NE) {
10142 Py_INCREF(Py_True);
10143 return Py_True;
10144 }
10145 }
10146 if (left == right)
10147 result = 0;
10148 else
10149 result = unicode_compare((PyUnicodeObject *)left,
10150 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010151
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010152 /* Convert the return value to a Boolean */
10153 switch (op) {
10154 case Py_EQ:
10155 v = TEST_COND(result == 0);
10156 break;
10157 case Py_NE:
10158 v = TEST_COND(result != 0);
10159 break;
10160 case Py_LE:
10161 v = TEST_COND(result <= 0);
10162 break;
10163 case Py_GE:
10164 v = TEST_COND(result >= 0);
10165 break;
10166 case Py_LT:
10167 v = TEST_COND(result == -1);
10168 break;
10169 case Py_GT:
10170 v = TEST_COND(result == 1);
10171 break;
10172 default:
10173 PyErr_BadArgument();
10174 return NULL;
10175 }
10176 Py_INCREF(v);
10177 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010178 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010179
Brian Curtindfc80e32011-08-10 20:28:54 -050010180 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010181}
10182
Alexander Belopolsky40018472011-02-26 01:02:56 +000010183int
10184PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010185{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010186 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010187 int kind1, kind2, kind;
10188 void *buf1, *buf2;
10189 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010190 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010191
10192 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010193 sub = PyUnicode_FromObject(element);
10194 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010195 PyErr_Format(PyExc_TypeError,
10196 "'in <string>' requires string as left operand, not %s",
10197 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010198 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010199 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010200 if (PyUnicode_READY(sub) == -1)
10201 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010202
Thomas Wouters477c8d52006-05-27 19:21:47 +000010203 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +020010204 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010205 Py_DECREF(sub);
10206 return -1;
10207 }
10208
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010209 kind1 = PyUnicode_KIND(str);
10210 kind2 = PyUnicode_KIND(sub);
10211 kind = kind1 > kind2 ? kind1 : kind2;
10212 buf1 = PyUnicode_DATA(str);
10213 buf2 = PyUnicode_DATA(sub);
10214 if (kind1 != kind)
10215 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
10216 if (!buf1) {
10217 Py_DECREF(sub);
10218 return -1;
10219 }
10220 if (kind2 != kind)
10221 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
10222 if (!buf2) {
10223 Py_DECREF(sub);
10224 if (kind1 != kind) PyMem_Free(buf1);
10225 return -1;
10226 }
10227 len1 = PyUnicode_GET_LENGTH(str);
10228 len2 = PyUnicode_GET_LENGTH(sub);
10229
10230 switch(kind) {
10231 case PyUnicode_1BYTE_KIND:
10232 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10233 break;
10234 case PyUnicode_2BYTE_KIND:
10235 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10236 break;
10237 case PyUnicode_4BYTE_KIND:
10238 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10239 break;
10240 default:
10241 result = -1;
10242 assert(0);
10243 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010244
10245 Py_DECREF(str);
10246 Py_DECREF(sub);
10247
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010248 if (kind1 != kind)
10249 PyMem_Free(buf1);
10250 if (kind2 != kind)
10251 PyMem_Free(buf2);
10252
Guido van Rossum403d68b2000-03-13 15:55:09 +000010253 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010254}
10255
Guido van Rossumd57fd912000-03-10 22:53:23 +000010256/* Concat to string or Unicode object giving a new Unicode object. */
10257
Alexander Belopolsky40018472011-02-26 01:02:56 +000010258PyObject *
10259PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010260{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010261 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010262 Py_UCS4 maxchar, maxchar2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010263
10264 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010265 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010266 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010267 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010268 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010269 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010270 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010271
10272 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010273 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010274 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010275 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010276 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010277 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010278 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010279 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010280 }
10281
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010282 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010283 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
10284 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010285
Guido van Rossumd57fd912000-03-10 22:53:23 +000010286 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010287 w = PyUnicode_New(
10288 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
10289 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010290 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010291 goto onError;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010292 copy_characters(w, 0, u, 0, PyUnicode_GET_LENGTH(u));
10293 copy_characters(w, PyUnicode_GET_LENGTH(u), v, 0, PyUnicode_GET_LENGTH(v));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010294 Py_DECREF(u);
10295 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010296 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010297 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010298
Benjamin Peterson29060642009-01-31 22:14:21 +000010299 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010300 Py_XDECREF(u);
10301 Py_XDECREF(v);
10302 return NULL;
10303}
10304
Victor Stinnerb0923652011-10-04 01:17:31 +020010305static void
10306unicode_append_inplace(PyObject **p_left, PyObject *right)
10307{
10308 Py_ssize_t left_len, right_len, new_len;
Victor Stinnerb0923652011-10-04 01:17:31 +020010309
10310 assert(PyUnicode_IS_READY(*p_left));
10311 assert(PyUnicode_IS_READY(right));
10312
10313 left_len = PyUnicode_GET_LENGTH(*p_left);
10314 right_len = PyUnicode_GET_LENGTH(right);
10315 if (left_len > PY_SSIZE_T_MAX - right_len) {
10316 PyErr_SetString(PyExc_OverflowError,
10317 "strings are too large to concat");
10318 goto error;
10319 }
10320 new_len = left_len + right_len;
10321
10322 /* Now we own the last reference to 'left', so we can resize it
10323 * in-place.
10324 */
10325 if (unicode_resize(p_left, new_len) != 0) {
10326 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10327 * deallocated so it cannot be put back into
10328 * 'variable'. The MemoryError is raised when there
10329 * is no value in 'variable', which might (very
10330 * remotely) be a cause of incompatibilities.
10331 */
10332 goto error;
10333 }
10334 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010335 copy_characters(*p_left, left_len, right, 0, right_len);
10336 _PyUnicode_DIRTY(*p_left);
Victor Stinnerb0923652011-10-04 01:17:31 +020010337 return;
10338
10339error:
10340 Py_DECREF(*p_left);
10341 *p_left = NULL;
10342}
10343
Walter Dörwald1ab83302007-05-18 17:15:44 +000010344void
Victor Stinner23e56682011-10-03 03:54:37 +020010345PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010346{
Victor Stinner23e56682011-10-03 03:54:37 +020010347 PyObject *left, *res;
10348
10349 if (p_left == NULL) {
10350 if (!PyErr_Occurred())
10351 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010352 return;
10353 }
Victor Stinner23e56682011-10-03 03:54:37 +020010354 left = *p_left;
10355 if (right == NULL || !PyUnicode_Check(left)) {
10356 if (!PyErr_Occurred())
10357 PyErr_BadInternalCall();
10358 goto error;
10359 }
10360
Victor Stinnere1335c72011-10-04 20:53:03 +020010361 if (PyUnicode_READY(left))
10362 goto error;
10363 if (PyUnicode_READY(right))
10364 goto error;
10365
Victor Stinner23e56682011-10-03 03:54:37 +020010366 if (PyUnicode_CheckExact(left) && left != unicode_empty
10367 && PyUnicode_CheckExact(right) && right != unicode_empty
10368 && unicode_resizable(left)
10369 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
10370 || _PyUnicode_WSTR(left) != NULL))
10371 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010372 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10373 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010374 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010375 not so different than duplicating the string. */
10376 if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
Victor Stinner23e56682011-10-03 03:54:37 +020010377 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010378 unicode_append_inplace(p_left, right);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010379 if (p_left != NULL)
10380 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010381 return;
10382 }
10383 }
10384
10385 res = PyUnicode_Concat(left, right);
10386 if (res == NULL)
10387 goto error;
10388 Py_DECREF(left);
10389 *p_left = res;
10390 return;
10391
10392error:
10393 Py_DECREF(*p_left);
10394 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +000010395}
10396
10397void
10398PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10399{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010400 PyUnicode_Append(pleft, right);
10401 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010402}
10403
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010404PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010405 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010406\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010407Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010408string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010409interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010410
10411static PyObject *
10412unicode_count(PyUnicodeObject *self, PyObject *args)
10413{
10414 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010415 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010416 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010417 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010418 int kind1, kind2, kind;
10419 void *buf1, *buf2;
10420 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010421
Jesus Ceaac451502011-04-20 17:09:23 +020010422 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10423 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010424 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010425
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010426 kind1 = PyUnicode_KIND(self);
10427 kind2 = PyUnicode_KIND(substring);
10428 kind = kind1 > kind2 ? kind1 : kind2;
10429 buf1 = PyUnicode_DATA(self);
10430 buf2 = PyUnicode_DATA(substring);
10431 if (kind1 != kind)
10432 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
10433 if (!buf1) {
10434 Py_DECREF(substring);
10435 return NULL;
10436 }
10437 if (kind2 != kind)
10438 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
10439 if (!buf2) {
10440 Py_DECREF(substring);
10441 if (kind1 != kind) PyMem_Free(buf1);
10442 return NULL;
10443 }
10444 len1 = PyUnicode_GET_LENGTH(self);
10445 len2 = PyUnicode_GET_LENGTH(substring);
10446
10447 ADJUST_INDICES(start, end, len1);
10448 switch(kind) {
10449 case PyUnicode_1BYTE_KIND:
10450 iresult = ucs1lib_count(
10451 ((Py_UCS1*)buf1) + start, end - start,
10452 buf2, len2, PY_SSIZE_T_MAX
10453 );
10454 break;
10455 case PyUnicode_2BYTE_KIND:
10456 iresult = ucs2lib_count(
10457 ((Py_UCS2*)buf1) + start, end - start,
10458 buf2, len2, PY_SSIZE_T_MAX
10459 );
10460 break;
10461 case PyUnicode_4BYTE_KIND:
10462 iresult = ucs4lib_count(
10463 ((Py_UCS4*)buf1) + start, end - start,
10464 buf2, len2, PY_SSIZE_T_MAX
10465 );
10466 break;
10467 default:
10468 assert(0); iresult = 0;
10469 }
10470
10471 result = PyLong_FromSsize_t(iresult);
10472
10473 if (kind1 != kind)
10474 PyMem_Free(buf1);
10475 if (kind2 != kind)
10476 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010477
10478 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010479
Guido van Rossumd57fd912000-03-10 22:53:23 +000010480 return result;
10481}
10482
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010483PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010484 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010485\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010486Encode S using the codec registered for encoding. Default encoding\n\
10487is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010488handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010489a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10490'xmlcharrefreplace' as well as any other name registered with\n\
10491codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010492
10493static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +000010494unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010495{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010496 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010497 char *encoding = NULL;
10498 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010499
Benjamin Peterson308d6372009-09-18 21:42:35 +000010500 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10501 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010502 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +000010503 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010504}
10505
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010506PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010507 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010508\n\
10509Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010510If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010511
10512static PyObject*
10513unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
10514{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010515 Py_ssize_t i, j, line_pos, src_len, incr;
10516 Py_UCS4 ch;
10517 PyObject *u;
10518 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010519 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010520 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010521 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010522
10523 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010524 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010525
Antoine Pitrou22425222011-10-04 19:10:51 +020010526 if (PyUnicode_READY(self) == -1)
10527 return NULL;
10528
Thomas Wouters7e474022000-07-16 12:04:32 +000010529 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010530 src_len = PyUnicode_GET_LENGTH(self);
10531 i = j = line_pos = 0;
10532 kind = PyUnicode_KIND(self);
10533 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010534 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010535 for (; i < src_len; i++) {
10536 ch = PyUnicode_READ(kind, src_data, i);
10537 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010538 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010539 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010540 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010541 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010542 goto overflow;
10543 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010544 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010545 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010546 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010547 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010548 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010549 goto overflow;
10550 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010551 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010552 if (ch == '\n' || ch == '\r')
10553 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010554 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010555 }
Antoine Pitroue19aa382011-10-04 16:04:01 +020010556 if (!found && PyUnicode_CheckExact(self)) {
10557 Py_INCREF((PyObject *) self);
10558 return (PyObject *) self;
10559 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010560
Guido van Rossumd57fd912000-03-10 22:53:23 +000010561 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010562 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010563 if (!u)
10564 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010565 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010566
Antoine Pitroue71d5742011-10-04 15:55:09 +020010567 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010568
Antoine Pitroue71d5742011-10-04 15:55:09 +020010569 for (; i < src_len; i++) {
10570 ch = PyUnicode_READ(kind, src_data, i);
10571 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010572 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010573 incr = tabsize - (line_pos % tabsize);
10574 line_pos += incr;
10575 while (incr--) {
10576 PyUnicode_WRITE(kind, dest_data, j, ' ');
10577 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010578 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010579 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010580 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010581 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010582 line_pos++;
10583 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010584 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010585 if (ch == '\n' || ch == '\r')
10586 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010587 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010588 }
10589 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinner17efeed2011-10-04 20:05:46 +020010590#ifndef DONT_MAKE_RESULT_READY
10591 if (_PyUnicode_READY_REPLACE(&u)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010592 Py_DECREF(u);
10593 return NULL;
10594 }
Victor Stinner17efeed2011-10-04 20:05:46 +020010595#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010596 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010597 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010598
Antoine Pitroue71d5742011-10-04 15:55:09 +020010599 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010600 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10601 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010602}
10603
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010604PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010605 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010606\n\
10607Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010608such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010609arguments start and end are interpreted as in slice notation.\n\
10610\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010611Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010612
10613static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010614unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010615{
Jesus Ceaac451502011-04-20 17:09:23 +020010616 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010617 Py_ssize_t start;
10618 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010619 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010620
Jesus Ceaac451502011-04-20 17:09:23 +020010621 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10622 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010623 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010624
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010625 if (PyUnicode_READY(self) == -1)
10626 return NULL;
10627 if (PyUnicode_READY(substring) == -1)
10628 return NULL;
10629
Victor Stinner794d5672011-10-10 03:21:36 +020010630 result = any_find_slice(1,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010631 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010632 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010633
10634 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010635
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010636 if (result == -2)
10637 return NULL;
10638
Christian Heimes217cfd12007-12-02 14:31:20 +000010639 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010640}
10641
10642static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010643unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010644{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010645 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10646 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010647 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010648 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010649}
10650
Guido van Rossumc2504932007-09-18 19:42:40 +000010651/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010652 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010653static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +000010654unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010655{
Guido van Rossumc2504932007-09-18 19:42:40 +000010656 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010657 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010658
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010659 if (_PyUnicode_HASH(self) != -1)
10660 return _PyUnicode_HASH(self);
10661 if (PyUnicode_READY(self) == -1)
10662 return -1;
10663 len = PyUnicode_GET_LENGTH(self);
10664
10665 /* The hash function as a macro, gets expanded three times below. */
10666#define HASH(P) \
10667 x = (Py_uhash_t)*P << 7; \
10668 while (--len >= 0) \
10669 x = (1000003*x) ^ (Py_uhash_t)*P++;
10670
10671 switch (PyUnicode_KIND(self)) {
10672 case PyUnicode_1BYTE_KIND: {
10673 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10674 HASH(c);
10675 break;
10676 }
10677 case PyUnicode_2BYTE_KIND: {
10678 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10679 HASH(s);
10680 break;
10681 }
10682 default: {
10683 Py_UCS4 *l;
10684 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10685 "Impossible switch case in unicode_hash");
10686 l = PyUnicode_4BYTE_DATA(self);
10687 HASH(l);
10688 break;
10689 }
10690 }
10691 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
10692
Guido van Rossumc2504932007-09-18 19:42:40 +000010693 if (x == -1)
10694 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010695 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010696 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010697}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010698#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000010699
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010700PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010701 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010702\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010703Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010704
10705static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010706unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010707{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010708 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +020010709 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010710 Py_ssize_t start;
10711 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010712
Jesus Ceaac451502011-04-20 17:09:23 +020010713 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10714 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010715 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010716
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010717 if (PyUnicode_READY(self) == -1)
10718 return NULL;
10719 if (PyUnicode_READY(substring) == -1)
10720 return NULL;
10721
Victor Stinner794d5672011-10-10 03:21:36 +020010722 result = any_find_slice(1,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010723 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010724 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010725
10726 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010727
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010728 if (result == -2)
10729 return NULL;
10730
Guido van Rossumd57fd912000-03-10 22:53:23 +000010731 if (result < 0) {
10732 PyErr_SetString(PyExc_ValueError, "substring not found");
10733 return NULL;
10734 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010735
Christian Heimes217cfd12007-12-02 14:31:20 +000010736 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010737}
10738
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010739PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010740 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010741\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010742Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010743at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010744
10745static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010746unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010747{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010748 Py_ssize_t i, length;
10749 int kind;
10750 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010751 int cased;
10752
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010753 if (PyUnicode_READY(self) == -1)
10754 return NULL;
10755 length = PyUnicode_GET_LENGTH(self);
10756 kind = PyUnicode_KIND(self);
10757 data = PyUnicode_DATA(self);
10758
Guido van Rossumd57fd912000-03-10 22:53:23 +000010759 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010760 if (length == 1)
10761 return PyBool_FromLong(
10762 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010763
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010764 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010765 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010766 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010767
Guido van Rossumd57fd912000-03-10 22:53:23 +000010768 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010769 for (i = 0; i < length; i++) {
10770 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010771
Benjamin Peterson29060642009-01-31 22:14:21 +000010772 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
10773 return PyBool_FromLong(0);
10774 else if (!cased && Py_UNICODE_ISLOWER(ch))
10775 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010776 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010777 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010778}
10779
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010780PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010781 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010782\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010783Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010784at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010785
10786static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010787unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010788{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010789 Py_ssize_t i, length;
10790 int kind;
10791 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010792 int cased;
10793
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010794 if (PyUnicode_READY(self) == -1)
10795 return NULL;
10796 length = PyUnicode_GET_LENGTH(self);
10797 kind = PyUnicode_KIND(self);
10798 data = PyUnicode_DATA(self);
10799
Guido van Rossumd57fd912000-03-10 22:53:23 +000010800 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010801 if (length == 1)
10802 return PyBool_FromLong(
10803 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010804
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010805 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010806 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010807 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010808
Guido van Rossumd57fd912000-03-10 22:53:23 +000010809 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010810 for (i = 0; i < length; i++) {
10811 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010812
Benjamin Peterson29060642009-01-31 22:14:21 +000010813 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
10814 return PyBool_FromLong(0);
10815 else if (!cased && Py_UNICODE_ISUPPER(ch))
10816 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010817 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010818 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010819}
10820
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010821PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010822 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010823\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010824Return True if S is a titlecased string and there is at least one\n\
10825character in S, i.e. upper- and titlecase characters may only\n\
10826follow uncased characters and lowercase characters only cased ones.\n\
10827Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010828
10829static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010830unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010831{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010832 Py_ssize_t i, length;
10833 int kind;
10834 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010835 int cased, previous_is_cased;
10836
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010837 if (PyUnicode_READY(self) == -1)
10838 return NULL;
10839 length = PyUnicode_GET_LENGTH(self);
10840 kind = PyUnicode_KIND(self);
10841 data = PyUnicode_DATA(self);
10842
Guido van Rossumd57fd912000-03-10 22:53:23 +000010843 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010844 if (length == 1) {
10845 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10846 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
10847 (Py_UNICODE_ISUPPER(ch) != 0));
10848 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010849
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010850 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010851 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010852 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010853
Guido van Rossumd57fd912000-03-10 22:53:23 +000010854 cased = 0;
10855 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010856 for (i = 0; i < length; i++) {
10857 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010858
Benjamin Peterson29060642009-01-31 22:14:21 +000010859 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10860 if (previous_is_cased)
10861 return PyBool_FromLong(0);
10862 previous_is_cased = 1;
10863 cased = 1;
10864 }
10865 else if (Py_UNICODE_ISLOWER(ch)) {
10866 if (!previous_is_cased)
10867 return PyBool_FromLong(0);
10868 previous_is_cased = 1;
10869 cased = 1;
10870 }
10871 else
10872 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010873 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010874 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010875}
10876
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010877PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010878 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010879\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010880Return True if all characters in S are whitespace\n\
10881and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010882
10883static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010884unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010885{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010886 Py_ssize_t i, length;
10887 int kind;
10888 void *data;
10889
10890 if (PyUnicode_READY(self) == -1)
10891 return NULL;
10892 length = PyUnicode_GET_LENGTH(self);
10893 kind = PyUnicode_KIND(self);
10894 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010895
Guido van Rossumd57fd912000-03-10 22:53:23 +000010896 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010897 if (length == 1)
10898 return PyBool_FromLong(
10899 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010900
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010901 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010902 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010903 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010904
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010905 for (i = 0; i < length; i++) {
10906 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010907 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010908 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010909 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010910 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010911}
10912
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010913PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010914 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010915\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010916Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010917and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010918
10919static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010920unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010921{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010922 Py_ssize_t i, length;
10923 int kind;
10924 void *data;
10925
10926 if (PyUnicode_READY(self) == -1)
10927 return NULL;
10928 length = PyUnicode_GET_LENGTH(self);
10929 kind = PyUnicode_KIND(self);
10930 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010931
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010932 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010933 if (length == 1)
10934 return PyBool_FromLong(
10935 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010936
10937 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010938 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010939 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010940
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010941 for (i = 0; i < length; i++) {
10942 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010943 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010944 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010945 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010946}
10947
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010948PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010949 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010950\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010951Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010952and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010953
10954static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010955unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010956{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010957 int kind;
10958 void *data;
10959 Py_ssize_t len, i;
10960
10961 if (PyUnicode_READY(self) == -1)
10962 return NULL;
10963
10964 kind = PyUnicode_KIND(self);
10965 data = PyUnicode_DATA(self);
10966 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010967
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010968 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010969 if (len == 1) {
10970 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10971 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10972 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010973
10974 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010975 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010976 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010977
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010978 for (i = 0; i < len; i++) {
10979 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010980 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010981 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010982 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010983 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010984}
10985
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010986PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010987 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010988\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010989Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010990False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010991
10992static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010993unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010994{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010995 Py_ssize_t i, length;
10996 int kind;
10997 void *data;
10998
10999 if (PyUnicode_READY(self) == -1)
11000 return NULL;
11001 length = PyUnicode_GET_LENGTH(self);
11002 kind = PyUnicode_KIND(self);
11003 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011004
Guido van Rossumd57fd912000-03-10 22:53:23 +000011005 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011006 if (length == 1)
11007 return PyBool_FromLong(
11008 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011009
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011010 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011011 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011012 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011013
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011014 for (i = 0; i < length; i++) {
11015 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011016 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011017 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011018 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011019}
11020
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011021PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011022 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011023\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011024Return True if all characters in S are digits\n\
11025and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011026
11027static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011028unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011029{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011030 Py_ssize_t i, length;
11031 int kind;
11032 void *data;
11033
11034 if (PyUnicode_READY(self) == -1)
11035 return NULL;
11036 length = PyUnicode_GET_LENGTH(self);
11037 kind = PyUnicode_KIND(self);
11038 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011039
Guido van Rossumd57fd912000-03-10 22:53:23 +000011040 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011041 if (length == 1) {
11042 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11043 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11044 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011045
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011046 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011047 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011048 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011049
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011050 for (i = 0; i < length; i++) {
11051 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011052 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011053 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011054 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011055}
11056
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011057PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011058 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011059\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011060Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011061False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011062
11063static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011064unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011065{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011066 Py_ssize_t i, length;
11067 int kind;
11068 void *data;
11069
11070 if (PyUnicode_READY(self) == -1)
11071 return NULL;
11072 length = PyUnicode_GET_LENGTH(self);
11073 kind = PyUnicode_KIND(self);
11074 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011075
Guido van Rossumd57fd912000-03-10 22:53:23 +000011076 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011077 if (length == 1)
11078 return PyBool_FromLong(
11079 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011080
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011081 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011082 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011083 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011084
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011085 for (i = 0; i < length; i++) {
11086 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011087 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011088 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011089 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011090}
11091
Martin v. Löwis47383402007-08-15 07:32:56 +000011092int
11093PyUnicode_IsIdentifier(PyObject *self)
11094{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011095 int kind;
11096 void *data;
11097 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011098 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011099
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011100 if (PyUnicode_READY(self) == -1) {
11101 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011102 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011103 }
11104
11105 /* Special case for empty strings */
11106 if (PyUnicode_GET_LENGTH(self) == 0)
11107 return 0;
11108 kind = PyUnicode_KIND(self);
11109 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011110
11111 /* PEP 3131 says that the first character must be in
11112 XID_Start and subsequent characters in XID_Continue,
11113 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011114 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011115 letters, digits, underscore). However, given the current
11116 definition of XID_Start and XID_Continue, it is sufficient
11117 to check just for these, except that _ must be allowed
11118 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011119 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011120 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011121 return 0;
11122
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011123 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011124 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011125 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011126 return 1;
11127}
11128
11129PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011130 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011131\n\
11132Return True if S is a valid identifier according\n\
11133to the language definition.");
11134
11135static PyObject*
11136unicode_isidentifier(PyObject *self)
11137{
11138 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11139}
11140
Georg Brandl559e5d72008-06-11 18:37:52 +000011141PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011142 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011143\n\
11144Return True if all characters in S are considered\n\
11145printable in repr() or S is empty, False otherwise.");
11146
11147static PyObject*
11148unicode_isprintable(PyObject *self)
11149{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011150 Py_ssize_t i, length;
11151 int kind;
11152 void *data;
11153
11154 if (PyUnicode_READY(self) == -1)
11155 return NULL;
11156 length = PyUnicode_GET_LENGTH(self);
11157 kind = PyUnicode_KIND(self);
11158 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011159
11160 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011161 if (length == 1)
11162 return PyBool_FromLong(
11163 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011164
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011165 for (i = 0; i < length; i++) {
11166 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011167 Py_RETURN_FALSE;
11168 }
11169 }
11170 Py_RETURN_TRUE;
11171}
11172
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011173PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011174 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011175\n\
11176Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011177iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011178
11179static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011180unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011181{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011182 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011183}
11184
Martin v. Löwis18e16552006-02-15 17:27:45 +000011185static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000011186unicode_length(PyUnicodeObject *self)
11187{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011188 if (PyUnicode_READY(self) == -1)
11189 return -1;
11190 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011191}
11192
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011193PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011194 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011195\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011196Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011197done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011198
11199static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011200unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011201{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011202 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011203 Py_UCS4 fillchar = ' ';
11204
11205 if (PyUnicode_READY(self) == -1)
11206 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011207
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011208 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011209 return NULL;
11210
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011211 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011212 Py_INCREF(self);
11213 return (PyObject*) self;
11214 }
11215
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011216 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011217}
11218
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011219PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011220 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011221\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011222Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011223
11224static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011225unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011226{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011227 return fixup(self, fixlower);
11228}
11229
/* Strip modes shared by the strip helpers below; the values double as
   indexes into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
/* PyArg_ParseTuple() format strings: one optional object argument,
   followed by the method name after the colon. */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i: skips the 3-character "|O:" prefix of
   the matching format string. */
#define STRIPNAME(i) (stripformat[i]+3)
11238
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011239/* externally visible for str.strip(unicode) */
11240PyObject *
11241_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
11242{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011243 void *data;
11244 int kind;
11245 Py_ssize_t i, j, len;
11246 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011247
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011248 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11249 return NULL;
11250
11251 kind = PyUnicode_KIND(self);
11252 data = PyUnicode_DATA(self);
11253 len = PyUnicode_GET_LENGTH(self);
11254 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11255 PyUnicode_DATA(sepobj),
11256 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011257
Benjamin Peterson14339b62009-01-31 16:36:08 +000011258 i = 0;
11259 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011260 while (i < len &&
11261 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011262 i++;
11263 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011264 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011265
Benjamin Peterson14339b62009-01-31 16:36:08 +000011266 j = len;
11267 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011268 do {
11269 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011270 } while (j >= i &&
11271 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011272 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011273 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011274
Victor Stinner12bab6d2011-10-01 01:53:49 +020011275 return PyUnicode_Substring((PyObject*)self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011276}
11277
11278PyObject*
11279PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11280{
11281 unsigned char *data;
11282 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011283 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011284
Victor Stinnerde636f32011-10-01 03:55:54 +020011285 if (PyUnicode_READY(self) == -1)
11286 return NULL;
11287
11288 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11289
Victor Stinner12bab6d2011-10-01 01:53:49 +020011290 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011291 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011292 if (PyUnicode_CheckExact(self)) {
11293 Py_INCREF(self);
11294 return self;
11295 }
11296 else
11297 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011298 }
11299
Victor Stinner12bab6d2011-10-01 01:53:49 +020011300 length = end - start;
11301 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011302 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011303
Victor Stinnerde636f32011-10-01 03:55:54 +020011304 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011305 PyErr_SetString(PyExc_IndexError, "string index out of range");
11306 return NULL;
11307 }
11308
Victor Stinnerb9275c12011-10-05 14:01:42 +020011309 if (PyUnicode_IS_ASCII(self)) {
11310 kind = PyUnicode_KIND(self);
11311 data = PyUnicode_1BYTE_DATA(self);
11312 return unicode_fromascii(data + start, length);
11313 }
11314 else {
11315 kind = PyUnicode_KIND(self);
11316 data = PyUnicode_1BYTE_DATA(self);
11317 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011318 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011319 length);
11320 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011321}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011322
11323static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011324do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011325{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011326 int kind;
11327 void *data;
11328 Py_ssize_t len, i, j;
11329
11330 if (PyUnicode_READY(self) == -1)
11331 return NULL;
11332
11333 kind = PyUnicode_KIND(self);
11334 data = PyUnicode_DATA(self);
11335 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011336
Benjamin Peterson14339b62009-01-31 16:36:08 +000011337 i = 0;
11338 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011339 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011340 i++;
11341 }
11342 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011343
Benjamin Peterson14339b62009-01-31 16:36:08 +000011344 j = len;
11345 if (striptype != LEFTSTRIP) {
11346 do {
11347 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011348 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011349 j++;
11350 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011351
Victor Stinner12bab6d2011-10-01 01:53:49 +020011352 return PyUnicode_Substring((PyObject*)self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011353}
11354
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011355
11356static PyObject *
11357do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
11358{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011359 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011360
Benjamin Peterson14339b62009-01-31 16:36:08 +000011361 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11362 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011363
Benjamin Peterson14339b62009-01-31 16:36:08 +000011364 if (sep != NULL && sep != Py_None) {
11365 if (PyUnicode_Check(sep))
11366 return _PyUnicode_XStrip(self, striptype, sep);
11367 else {
11368 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011369 "%s arg must be None or str",
11370 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011371 return NULL;
11372 }
11373 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011374
Benjamin Peterson14339b62009-01-31 16:36:08 +000011375 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011376}
11377
11378
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011379PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011380 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011381\n\
11382Return a copy of the string S with leading and trailing\n\
11383whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011384If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011385
11386static PyObject *
11387unicode_strip(PyUnicodeObject *self, PyObject *args)
11388{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011389 if (PyTuple_GET_SIZE(args) == 0)
11390 return do_strip(self, BOTHSTRIP); /* Common case */
11391 else
11392 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011393}
11394
11395
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011396PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011397 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011398\n\
11399Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011400If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011401
11402static PyObject *
11403unicode_lstrip(PyUnicodeObject *self, PyObject *args)
11404{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011405 if (PyTuple_GET_SIZE(args) == 0)
11406 return do_strip(self, LEFTSTRIP); /* Common case */
11407 else
11408 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011409}
11410
11411
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011412PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011413 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011414\n\
11415Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011416If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011417
11418static PyObject *
11419unicode_rstrip(PyUnicodeObject *self, PyObject *args)
11420{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011421 if (PyTuple_GET_SIZE(args) == 0)
11422 return do_strip(self, RIGHTSTRIP); /* Common case */
11423 else
11424 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011425}
11426
11427
Guido van Rossumd57fd912000-03-10 22:53:23 +000011428static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000011429unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011430{
11431 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011432 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011433
Georg Brandl222de0f2009-04-12 12:01:50 +000011434 if (len < 1) {
11435 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011436 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011437 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011438
Tim Peters7a29bd52001-09-12 03:03:31 +000011439 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011440 /* no repeat, return original string */
11441 Py_INCREF(str);
11442 return (PyObject*) str;
11443 }
Tim Peters8f422462000-09-09 06:13:41 +000011444
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011445 if (PyUnicode_READY(str) == -1)
11446 return NULL;
11447
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011448 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011449 PyErr_SetString(PyExc_OverflowError,
11450 "repeated string is too long");
11451 return NULL;
11452 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011453 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011454
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011455 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011456 if (!u)
11457 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011458 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011459
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011460 if (PyUnicode_GET_LENGTH(str) == 1) {
11461 const int kind = PyUnicode_KIND(str);
11462 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11463 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011464 if (kind == PyUnicode_1BYTE_KIND)
11465 memset(to, (unsigned char)fill_char, len);
11466 else {
11467 for (n = 0; n < len; ++n)
11468 PyUnicode_WRITE(kind, to, n, fill_char);
11469 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011470 }
11471 else {
11472 /* number of characters copied this far */
11473 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011474 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011475 char *to = (char *) PyUnicode_DATA(u);
11476 Py_MEMCPY(to, PyUnicode_DATA(str),
11477 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011478 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011479 n = (done <= nchars-done) ? done : nchars-done;
11480 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011481 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011482 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011483 }
11484
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011485 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011486 return (PyObject*) u;
11487}
11488
Alexander Belopolsky40018472011-02-26 01:02:56 +000011489PyObject *
11490PyUnicode_Replace(PyObject *obj,
11491 PyObject *subobj,
11492 PyObject *replobj,
11493 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011494{
11495 PyObject *self;
11496 PyObject *str1;
11497 PyObject *str2;
11498 PyObject *result;
11499
11500 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011501 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011502 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011503 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011504 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011505 Py_DECREF(self);
11506 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011507 }
11508 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011509 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011510 Py_DECREF(self);
11511 Py_DECREF(str1);
11512 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011513 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011514 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011515 Py_DECREF(self);
11516 Py_DECREF(str1);
11517 Py_DECREF(str2);
11518 return result;
11519}
11520
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011521PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011522 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011523\n\
11524Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011525old replaced by new. If the optional argument count is\n\
11526given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011527
11528static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011529unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011530{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011531 PyObject *str1;
11532 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011533 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011534 PyObject *result;
11535
Martin v. Löwis18e16552006-02-15 17:27:45 +000011536 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011537 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011538 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011539 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011540 str1 = PyUnicode_FromObject(str1);
11541 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11542 return NULL;
11543 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011544 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011545 Py_DECREF(str1);
11546 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011547 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011548
11549 result = replace(self, str1, str2, maxcount);
11550
11551 Py_DECREF(str1);
11552 Py_DECREF(str2);
11553 return result;
11554}
11555
Alexander Belopolsky40018472011-02-26 01:02:56 +000011556static PyObject *
11557unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011558{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011559 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011560 Py_ssize_t isize;
11561 Py_ssize_t osize, squote, dquote, i, o;
11562 Py_UCS4 max, quote;
11563 int ikind, okind;
11564 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011565
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011566 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011567 return NULL;
11568
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011569 isize = PyUnicode_GET_LENGTH(unicode);
11570 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011571
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011572 /* Compute length of output, quote characters, and
11573 maximum character */
11574 osize = 2; /* quotes */
11575 max = 127;
11576 squote = dquote = 0;
11577 ikind = PyUnicode_KIND(unicode);
11578 for (i = 0; i < isize; i++) {
11579 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11580 switch (ch) {
11581 case '\'': squote++; osize++; break;
11582 case '"': dquote++; osize++; break;
11583 case '\\': case '\t': case '\r': case '\n':
11584 osize += 2; break;
11585 default:
11586 /* Fast-path ASCII */
11587 if (ch < ' ' || ch == 0x7f)
11588 osize += 4; /* \xHH */
11589 else if (ch < 0x7f)
11590 osize++;
11591 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11592 osize++;
11593 max = ch > max ? ch : max;
11594 }
11595 else if (ch < 0x100)
11596 osize += 4; /* \xHH */
11597 else if (ch < 0x10000)
11598 osize += 6; /* \uHHHH */
11599 else
11600 osize += 10; /* \uHHHHHHHH */
11601 }
11602 }
11603
11604 quote = '\'';
11605 if (squote) {
11606 if (dquote)
11607 /* Both squote and dquote present. Use squote,
11608 and escape them */
11609 osize += squote;
11610 else
11611 quote = '"';
11612 }
11613
11614 repr = PyUnicode_New(osize, max);
11615 if (repr == NULL)
11616 return NULL;
11617 okind = PyUnicode_KIND(repr);
11618 odata = PyUnicode_DATA(repr);
11619
11620 PyUnicode_WRITE(okind, odata, 0, quote);
11621 PyUnicode_WRITE(okind, odata, osize-1, quote);
11622
11623 for (i = 0, o = 1; i < isize; i++) {
11624 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011625
11626 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011627 if ((ch == quote) || (ch == '\\')) {
11628 PyUnicode_WRITE(okind, odata, o++, '\\');
11629 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011630 continue;
11631 }
11632
Benjamin Peterson29060642009-01-31 22:14:21 +000011633 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011634 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011635 PyUnicode_WRITE(okind, odata, o++, '\\');
11636 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011637 }
11638 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011639 PyUnicode_WRITE(okind, odata, o++, '\\');
11640 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011641 }
11642 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011643 PyUnicode_WRITE(okind, odata, o++, '\\');
11644 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011645 }
11646
11647 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011648 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011649 PyUnicode_WRITE(okind, odata, o++, '\\');
11650 PyUnicode_WRITE(okind, odata, o++, 'x');
11651 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11652 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011653 }
11654
Georg Brandl559e5d72008-06-11 18:37:52 +000011655 /* Copy ASCII characters as-is */
11656 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011657 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011658 }
11659
Benjamin Peterson29060642009-01-31 22:14:21 +000011660 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011661 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011662 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011663 (categories Z* and C* except ASCII space)
11664 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011665 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011666 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011667 if (ch <= 0xff) {
11668 PyUnicode_WRITE(okind, odata, o++, '\\');
11669 PyUnicode_WRITE(okind, odata, o++, 'x');
11670 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11671 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011672 }
11673 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011674 else if (ch >= 0x10000) {
11675 PyUnicode_WRITE(okind, odata, o++, '\\');
11676 PyUnicode_WRITE(okind, odata, o++, 'U');
11677 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
11678 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
11679 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
11680 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
11681 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11682 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11683 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11684 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011685 }
11686 /* Map 16-bit characters to '\uxxxx' */
11687 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011688 PyUnicode_WRITE(okind, odata, o++, '\\');
11689 PyUnicode_WRITE(okind, odata, o++, 'u');
11690 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11691 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11692 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11693 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011694 }
11695 }
11696 /* Copy characters as-is */
11697 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011698 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011699 }
11700 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000011701 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011702 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020011703 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000011704 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011705}
11706
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011707PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011708 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011709\n\
11710Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011711such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011712arguments start and end are interpreted as in slice notation.\n\
11713\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011714Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011715
11716static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011717unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011718{
Jesus Ceaac451502011-04-20 17:09:23 +020011719 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011720 Py_ssize_t start;
11721 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011722 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011723
Jesus Ceaac451502011-04-20 17:09:23 +020011724 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11725 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011726 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011727
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011728 if (PyUnicode_READY(self) == -1)
11729 return NULL;
11730 if (PyUnicode_READY(substring) == -1)
11731 return NULL;
11732
Victor Stinner794d5672011-10-10 03:21:36 +020011733 result = any_find_slice(-1,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011734 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011735 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011736
11737 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011738
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011739 if (result == -2)
11740 return NULL;
11741
Christian Heimes217cfd12007-12-02 14:31:20 +000011742 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011743}
11744
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011745PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011746 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011747\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011748Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011749
11750static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011751unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011752{
Jesus Ceaac451502011-04-20 17:09:23 +020011753 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011754 Py_ssize_t start;
11755 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011756 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011757
Jesus Ceaac451502011-04-20 17:09:23 +020011758 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
11759 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011760 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011761
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011762 if (PyUnicode_READY(self) == -1)
11763 return NULL;
11764 if (PyUnicode_READY(substring) == -1)
11765 return NULL;
11766
Victor Stinner794d5672011-10-10 03:21:36 +020011767 result = any_find_slice(-1,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011768 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011769 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011770
11771 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011772
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011773 if (result == -2)
11774 return NULL;
11775
Guido van Rossumd57fd912000-03-10 22:53:23 +000011776 if (result < 0) {
11777 PyErr_SetString(PyExc_ValueError, "substring not found");
11778 return NULL;
11779 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011780
Christian Heimes217cfd12007-12-02 14:31:20 +000011781 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011782}
11783
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011784PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011785 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011786\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011787Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011788done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011789
11790static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011791unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011792{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011793 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011794 Py_UCS4 fillchar = ' ';
11795
Victor Stinnere9a29352011-10-01 02:14:59 +020011796 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011797 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011798
Victor Stinnere9a29352011-10-01 02:14:59 +020011799 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011800 return NULL;
11801
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011802 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011803 Py_INCREF(self);
11804 return (PyObject*) self;
11805 }
11806
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011807 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011808}
11809
Alexander Belopolsky40018472011-02-26 01:02:56 +000011810PyObject *
11811PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011812{
11813 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000011814
Guido van Rossumd57fd912000-03-10 22:53:23 +000011815 s = PyUnicode_FromObject(s);
11816 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011817 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011818 if (sep != NULL) {
11819 sep = PyUnicode_FromObject(sep);
11820 if (sep == NULL) {
11821 Py_DECREF(s);
11822 return NULL;
11823 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011824 }
11825
Victor Stinner9310abb2011-10-05 00:59:23 +020011826 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011827
11828 Py_DECREF(s);
11829 Py_XDECREF(sep);
11830 return result;
11831}
11832
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011833PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011834 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011835\n\
11836Return a list of the words in S, using sep as the\n\
11837delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000011838splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000011839whitespace string is a separator and empty strings are\n\
11840removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011841
11842static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011843unicode_split(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011844{
11845 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011846 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011847
Martin v. Löwis18e16552006-02-15 17:27:45 +000011848 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011849 return NULL;
11850
11851 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011852 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011853 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020011854 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011855 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011856 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011857}
11858
Thomas Wouters477c8d52006-05-27 19:21:47 +000011859PyObject *
11860PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11861{
11862 PyObject* str_obj;
11863 PyObject* sep_obj;
11864 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011865 int kind1, kind2, kind;
11866 void *buf1 = NULL, *buf2 = NULL;
11867 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011868
11869 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020011870 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011871 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011872 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011873 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011874 Py_DECREF(str_obj);
11875 return NULL;
11876 }
11877
Victor Stinner14f8f022011-10-05 20:58:25 +020011878 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011879 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020011880 kind = Py_MAX(kind1, kind2);
11881 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011882 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020011883 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011884 if (!buf1)
11885 goto onError;
11886 buf2 = PyUnicode_DATA(sep_obj);
11887 if (kind2 != kind)
11888 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11889 if (!buf2)
11890 goto onError;
11891 len1 = PyUnicode_GET_LENGTH(str_obj);
11892 len2 = PyUnicode_GET_LENGTH(sep_obj);
11893
Victor Stinner14f8f022011-10-05 20:58:25 +020011894 switch(PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011895 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020011896 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
11897 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11898 else
11899 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011900 break;
11901 case PyUnicode_2BYTE_KIND:
11902 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11903 break;
11904 case PyUnicode_4BYTE_KIND:
11905 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11906 break;
11907 default:
11908 assert(0);
11909 out = 0;
11910 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011911
11912 Py_DECREF(sep_obj);
11913 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011914 if (kind1 != kind)
11915 PyMem_Free(buf1);
11916 if (kind2 != kind)
11917 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011918
11919 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011920 onError:
11921 Py_DECREF(sep_obj);
11922 Py_DECREF(str_obj);
11923 if (kind1 != kind && buf1)
11924 PyMem_Free(buf1);
11925 if (kind2 != kind && buf2)
11926 PyMem_Free(buf2);
11927 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011928}
11929
11930
11931PyObject *
11932PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11933{
11934 PyObject* str_obj;
11935 PyObject* sep_obj;
11936 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011937 int kind1, kind2, kind;
11938 void *buf1 = NULL, *buf2 = NULL;
11939 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011940
11941 str_obj = PyUnicode_FromObject(str_in);
11942 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011943 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011944 sep_obj = PyUnicode_FromObject(sep_in);
11945 if (!sep_obj) {
11946 Py_DECREF(str_obj);
11947 return NULL;
11948 }
11949
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011950 kind1 = PyUnicode_KIND(str_in);
11951 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011952 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011953 buf1 = PyUnicode_DATA(str_in);
11954 if (kind1 != kind)
11955 buf1 = _PyUnicode_AsKind(str_in, kind);
11956 if (!buf1)
11957 goto onError;
11958 buf2 = PyUnicode_DATA(sep_obj);
11959 if (kind2 != kind)
11960 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11961 if (!buf2)
11962 goto onError;
11963 len1 = PyUnicode_GET_LENGTH(str_obj);
11964 len2 = PyUnicode_GET_LENGTH(sep_obj);
11965
11966 switch(PyUnicode_KIND(str_in)) {
11967 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020011968 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
11969 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11970 else
11971 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011972 break;
11973 case PyUnicode_2BYTE_KIND:
11974 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11975 break;
11976 case PyUnicode_4BYTE_KIND:
11977 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11978 break;
11979 default:
11980 assert(0);
11981 out = 0;
11982 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011983
11984 Py_DECREF(sep_obj);
11985 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011986 if (kind1 != kind)
11987 PyMem_Free(buf1);
11988 if (kind2 != kind)
11989 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011990
11991 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011992 onError:
11993 Py_DECREF(sep_obj);
11994 Py_DECREF(str_obj);
11995 if (kind1 != kind && buf1)
11996 PyMem_Free(buf1);
11997 if (kind2 != kind && buf2)
11998 PyMem_Free(buf2);
11999 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012000}
12001
12002PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012003 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012004\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012005Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012006the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012007found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012008
12009static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012010unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012011{
Victor Stinner9310abb2011-10-05 00:59:23 +020012012 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012013}
12014
12015PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012016 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012017\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012018Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012019the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012020separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012021
12022static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012023unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012024{
Victor Stinner9310abb2011-10-05 00:59:23 +020012025 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012026}
12027
Alexander Belopolsky40018472011-02-26 01:02:56 +000012028PyObject *
12029PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012030{
12031 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012032
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012033 s = PyUnicode_FromObject(s);
12034 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012035 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012036 if (sep != NULL) {
12037 sep = PyUnicode_FromObject(sep);
12038 if (sep == NULL) {
12039 Py_DECREF(s);
12040 return NULL;
12041 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012042 }
12043
Victor Stinner9310abb2011-10-05 00:59:23 +020012044 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012045
12046 Py_DECREF(s);
12047 Py_XDECREF(sep);
12048 return result;
12049}
12050
12051PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012052 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012053\n\
12054Return a list of the words in S, using sep as the\n\
12055delimiter string, starting at the end of the string and\n\
12056working to the front. If maxsplit is given, at most maxsplit\n\
12057splits are done. If sep is not specified, any whitespace string\n\
12058is a separator.");
12059
12060static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012061unicode_rsplit(PyObject *self, PyObject *args)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012062{
12063 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012064 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012065
Martin v. Löwis18e16552006-02-15 17:27:45 +000012066 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012067 return NULL;
12068
12069 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012070 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012071 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012072 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012073 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012074 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012075}
12076
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012077PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012078 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012079\n\
12080Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012081Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012082is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012083
12084static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012085unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012086{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012087 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012088 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012089
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012090 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12091 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012092 return NULL;
12093
Guido van Rossum86662912000-04-11 15:38:46 +000012094 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012095}
12096
12097static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012098PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012099{
Walter Dörwald346737f2007-05-31 10:44:43 +000012100 if (PyUnicode_CheckExact(self)) {
12101 Py_INCREF(self);
12102 return self;
12103 } else
12104 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020012105 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012106}
12107
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012108PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012109 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012110\n\
12111Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012112and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012113
12114static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012115unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012116{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012117 return fixup(self, fixswapcase);
12118}
12119
Georg Brandlceee0772007-11-27 23:48:05 +000012120PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012121 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012122\n\
12123Return a translation table usable for str.translate().\n\
12124If there is only one argument, it must be a dictionary mapping Unicode\n\
12125ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012126Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012127If there are two arguments, they must be strings of equal length, and\n\
12128in the resulting dictionary, each character in x will be mapped to the\n\
12129character at the same position in y. If there is a third argument, it\n\
12130must be a string, whose characters will be mapped to None in the result.");
12131
12132static PyObject*
12133unicode_maketrans(PyUnicodeObject *null, PyObject *args)
12134{
12135 PyObject *x, *y = NULL, *z = NULL;
12136 PyObject *new = NULL, *key, *value;
12137 Py_ssize_t i = 0;
12138 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012139
Georg Brandlceee0772007-11-27 23:48:05 +000012140 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12141 return NULL;
12142 new = PyDict_New();
12143 if (!new)
12144 return NULL;
12145 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012146 int x_kind, y_kind, z_kind;
12147 void *x_data, *y_data, *z_data;
12148
Georg Brandlceee0772007-11-27 23:48:05 +000012149 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012150 if (!PyUnicode_Check(x)) {
12151 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12152 "be a string if there is a second argument");
12153 goto err;
12154 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012155 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012156 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12157 "arguments must have equal length");
12158 goto err;
12159 }
12160 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012161 x_kind = PyUnicode_KIND(x);
12162 y_kind = PyUnicode_KIND(y);
12163 x_data = PyUnicode_DATA(x);
12164 y_data = PyUnicode_DATA(y);
12165 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12166 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12167 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012168 if (!key || !value)
12169 goto err;
12170 res = PyDict_SetItem(new, key, value);
12171 Py_DECREF(key);
12172 Py_DECREF(value);
12173 if (res < 0)
12174 goto err;
12175 }
12176 /* create entries for deleting chars in z */
12177 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012178 z_kind = PyUnicode_KIND(z);
12179 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012180 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012181 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012182 if (!key)
12183 goto err;
12184 res = PyDict_SetItem(new, key, Py_None);
12185 Py_DECREF(key);
12186 if (res < 0)
12187 goto err;
12188 }
12189 }
12190 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012191 int kind;
12192 void *data;
12193
Georg Brandlceee0772007-11-27 23:48:05 +000012194 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012195 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012196 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12197 "to maketrans it must be a dict");
12198 goto err;
12199 }
12200 /* copy entries into the new dict, converting string keys to int keys */
12201 while (PyDict_Next(x, &i, &key, &value)) {
12202 if (PyUnicode_Check(key)) {
12203 /* convert string keys to integer keys */
12204 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012205 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012206 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12207 "table must be of length 1");
12208 goto err;
12209 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012210 kind = PyUnicode_KIND(key);
12211 data = PyUnicode_DATA(key);
12212 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012213 if (!newkey)
12214 goto err;
12215 res = PyDict_SetItem(new, newkey, value);
12216 Py_DECREF(newkey);
12217 if (res < 0)
12218 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012219 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012220 /* just keep integer keys */
12221 if (PyDict_SetItem(new, key, value) < 0)
12222 goto err;
12223 } else {
12224 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12225 "be strings or integers");
12226 goto err;
12227 }
12228 }
12229 }
12230 return new;
12231 err:
12232 Py_DECREF(new);
12233 return NULL;
12234}
12235
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012236PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012237 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012238\n\
12239Return a copy of the string S, where all characters have been mapped\n\
12240through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012241Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012242Unmapped characters are left untouched. Characters mapped to None\n\
12243are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012244
12245static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012246unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012247{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012248 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012249}
12250
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012251PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012252 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012253\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012254Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012255
12256static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012257unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012258{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012259 return fixup(self, fixupper);
12260}
12261
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012262PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012263 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012264\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012265Pad a numeric string S with zeros on the left, to fill a field\n\
12266of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012267
12268static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012269unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012270{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012271 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012272 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012273 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012274 int kind;
12275 void *data;
12276 Py_UCS4 chr;
12277
12278 if (PyUnicode_READY(self) == -1)
12279 return NULL;
12280
Martin v. Löwis18e16552006-02-15 17:27:45 +000012281 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012282 return NULL;
12283
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012284 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012285 if (PyUnicode_CheckExact(self)) {
12286 Py_INCREF(self);
12287 return (PyObject*) self;
12288 }
12289 else
Victor Stinner2219e0a2011-10-01 01:16:59 +020012290 return PyUnicode_Copy((PyObject*)self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012291 }
12292
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012293 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012294
12295 u = pad(self, fill, 0, '0');
12296
Walter Dörwald068325e2002-04-15 13:36:47 +000012297 if (u == NULL)
12298 return NULL;
12299
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012300 kind = PyUnicode_KIND(u);
12301 data = PyUnicode_DATA(u);
12302 chr = PyUnicode_READ(kind, data, fill);
12303
12304 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012305 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012306 PyUnicode_WRITE(kind, data, 0, chr);
12307 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012308 }
12309
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012310 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012311 return (PyObject*) u;
12312}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012313
12314#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012315static PyObject *
12316unicode__decimal2ascii(PyObject *self)
12317{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012318 return PyUnicode_TransformDecimalAndSpaceToASCII(self);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012319}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012320#endif
12321
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012322PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012323 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012324\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012325Return True if S starts with the specified prefix, False otherwise.\n\
12326With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012327With optional end, stop comparing S at that position.\n\
12328prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012329
12330static PyObject *
12331unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012332 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012333{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012334 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012335 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012336 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012337 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012338 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012339
Jesus Ceaac451502011-04-20 17:09:23 +020012340 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012341 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012342 if (PyTuple_Check(subobj)) {
12343 Py_ssize_t i;
12344 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12345 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012346 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012347 if (substring == NULL)
12348 return NULL;
12349 result = tailmatch(self, substring, start, end, -1);
12350 Py_DECREF(substring);
12351 if (result) {
12352 Py_RETURN_TRUE;
12353 }
12354 }
12355 /* nothing matched */
12356 Py_RETURN_FALSE;
12357 }
12358 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012359 if (substring == NULL) {
12360 if (PyErr_ExceptionMatches(PyExc_TypeError))
12361 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12362 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012363 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012364 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012365 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012366 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012367 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012368}
12369
12370
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012371PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012372 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012373\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012374Return True if S ends with the specified suffix, False otherwise.\n\
12375With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012376With optional end, stop comparing S at that position.\n\
12377suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012378
12379static PyObject *
12380unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012381 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012382{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012383 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012384 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012385 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012386 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012387 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012388
Jesus Ceaac451502011-04-20 17:09:23 +020012389 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012390 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012391 if (PyTuple_Check(subobj)) {
12392 Py_ssize_t i;
12393 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12394 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012395 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012396 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012397 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012398 result = tailmatch(self, substring, start, end, +1);
12399 Py_DECREF(substring);
12400 if (result) {
12401 Py_RETURN_TRUE;
12402 }
12403 }
12404 Py_RETURN_FALSE;
12405 }
12406 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012407 if (substring == NULL) {
12408 if (PyErr_ExceptionMatches(PyExc_TypeError))
12409 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12410 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012411 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012412 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012413 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012414 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012415 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012416}
12417
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012418#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012419
12420PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012421 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012422\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012423Return a formatted version of S, using substitutions from args and kwargs.\n\
12424The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012425
Eric Smith27bbca62010-11-04 17:06:58 +000012426PyDoc_STRVAR(format_map__doc__,
12427 "S.format_map(mapping) -> str\n\
12428\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012429Return a formatted version of S, using substitutions from mapping.\n\
12430The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012431
Eric Smith4a7d76d2008-05-30 18:10:19 +000012432static PyObject *
12433unicode__format__(PyObject* self, PyObject* args)
12434{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012435 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012436
12437 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12438 return NULL;
12439
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012440 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012441 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012442 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012443}
12444
Eric Smith8c663262007-08-25 02:26:07 +000012445PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012446 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012447\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012448Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012449
12450static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012451unicode__sizeof__(PyUnicodeObject *v)
12452{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012453 Py_ssize_t size;
12454
12455 /* If it's a compact object, account for base structure +
12456 character data. */
12457 if (PyUnicode_IS_COMPACT_ASCII(v))
12458 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12459 else if (PyUnicode_IS_COMPACT(v))
12460 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012461 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012462 else {
12463 /* If it is a two-block object, account for base object, and
12464 for character block if present. */
12465 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012466 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012467 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012468 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012469 }
12470 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012471 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012472 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012473 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012474 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012475 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012476
12477 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012478}
12479
12480PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012481 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012482
12483static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012484unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012485{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012486 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012487 if (!copy)
12488 return NULL;
12489 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012490}
12491
Guido van Rossumd57fd912000-03-10 22:53:23 +000012492static PyMethodDef unicode_methods[] = {
12493
12494 /* Order is according to common usage: often used methods should
12495 appear first, since lookup is done sequentially. */
12496
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012497 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012498 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12499 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012500 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012501 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12502 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12503 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12504 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12505 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12506 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12507 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012508 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012509 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12510 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12511 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012512 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012513 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12514 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12515 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012516 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012517 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012518 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012519 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012520 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12521 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12522 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12523 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12524 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12525 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12526 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12527 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12528 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12529 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12530 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12531 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12532 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12533 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012534 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012535 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012536 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012537 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012538 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012539 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012540 {"maketrans", (PyCFunction) unicode_maketrans,
12541 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012542 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012543#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012544 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012545#endif
12546
12547#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012548 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012549 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012550#endif
12551
Benjamin Peterson14339b62009-01-31 16:36:08 +000012552 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012553 {NULL, NULL}
12554};
12555
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012556static PyObject *
12557unicode_mod(PyObject *v, PyObject *w)
12558{
Brian Curtindfc80e32011-08-10 20:28:54 -050012559 if (!PyUnicode_Check(v))
12560 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012561 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012562}
12563
12564static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012565 0, /*nb_add*/
12566 0, /*nb_subtract*/
12567 0, /*nb_multiply*/
12568 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012569};
12570
Guido van Rossumd57fd912000-03-10 22:53:23 +000012571static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012572 (lenfunc) unicode_length, /* sq_length */
12573 PyUnicode_Concat, /* sq_concat */
12574 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12575 (ssizeargfunc) unicode_getitem, /* sq_item */
12576 0, /* sq_slice */
12577 0, /* sq_ass_item */
12578 0, /* sq_ass_slice */
12579 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012580};
12581
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012582static PyObject*
12583unicode_subscript(PyUnicodeObject* self, PyObject* item)
12584{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012585 if (PyUnicode_READY(self) == -1)
12586 return NULL;
12587
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000012588 if (PyIndex_Check(item)) {
12589 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012590 if (i == -1 && PyErr_Occurred())
12591 return NULL;
12592 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012593 i += PyUnicode_GET_LENGTH(self);
Victor Stinner2fe5ced2011-10-02 00:25:40 +020012594 return unicode_getitem((PyObject*)self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012595 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000012596 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012597 PyObject *result;
12598 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012599 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012600 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012601
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012602 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000012603 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012604 return NULL;
12605 }
12606
12607 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012608 return PyUnicode_New(0, 0);
12609 } else if (start == 0 && step == 1 &&
12610 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000012611 PyUnicode_CheckExact(self)) {
12612 Py_INCREF(self);
12613 return (PyObject *)self;
12614 } else if (step == 1) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012615 return PyUnicode_Substring((PyObject*)self,
12616 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012617 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012618 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012619 src_kind = PyUnicode_KIND(self);
12620 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020012621 if (!PyUnicode_IS_ASCII(self)) {
12622 kind_limit = kind_maxchar_limit(src_kind);
12623 max_char = 0;
12624 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12625 ch = PyUnicode_READ(src_kind, src_data, cur);
12626 if (ch > max_char) {
12627 max_char = ch;
12628 if (max_char >= kind_limit)
12629 break;
12630 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012631 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012632 }
Victor Stinner55c99112011-10-13 01:17:06 +020012633 else
12634 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012635 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012636 if (result == NULL)
12637 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012638 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012639 dest_data = PyUnicode_DATA(result);
12640
12641 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012642 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
12643 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012644 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012645 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012646 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012647 } else {
12648 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12649 return NULL;
12650 }
12651}
12652
12653static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012654 (lenfunc)unicode_length, /* mp_length */
12655 (binaryfunc)unicode_subscript, /* mp_subscript */
12656 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012657};
12658
Guido van Rossumd57fd912000-03-10 22:53:23 +000012659
Guido van Rossumd57fd912000-03-10 22:53:23 +000012660/* Helpers for PyUnicode_Format() */
12661
12662static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000012663getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012664{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012665 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012666 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012667 (*p_argidx)++;
12668 if (arglen < 0)
12669 return args;
12670 else
12671 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012672 }
12673 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012674 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012675 return NULL;
12676}
12677
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012678/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012679
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012680static PyObject *
12681formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012682{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012683 char *p;
12684 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012685 double x;
Tim Petersced69f82003-09-16 20:30:58 +000012686
Guido van Rossumd57fd912000-03-10 22:53:23 +000012687 x = PyFloat_AsDouble(v);
12688 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012689 return NULL;
12690
Guido van Rossumd57fd912000-03-10 22:53:23 +000012691 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012692 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000012693
Eric Smith0923d1d2009-04-16 20:16:10 +000012694 p = PyOS_double_to_string(x, type, prec,
12695 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012696 if (p == NULL)
12697 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012698 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000012699 PyMem_Free(p);
12700 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012701}
12702
Tim Peters38fd5b62000-09-21 05:43:11 +000012703static PyObject*
12704formatlong(PyObject *val, int flags, int prec, int type)
12705{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012706 char *buf;
12707 int len;
12708 PyObject *str; /* temporary string object. */
12709 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012710
Benjamin Peterson14339b62009-01-31 16:36:08 +000012711 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
12712 if (!str)
12713 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012714 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012715 Py_DECREF(str);
12716 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012717}
12718
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012719static Py_UCS4
12720formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012721{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012722 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012723 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012724 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012725 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000012726 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012727 goto onError;
12728 }
12729 else {
12730 /* Integer input truncated to a character */
12731 long x;
12732 x = PyLong_AsLong(v);
12733 if (x == -1 && PyErr_Occurred())
12734 goto onError;
12735
12736 if (x < 0 || x > 0x10ffff) {
12737 PyErr_SetString(PyExc_OverflowError,
12738 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012739 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012740 }
12741
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012742 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012743 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012744
Benjamin Peterson29060642009-01-31 22:14:21 +000012745 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012746 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012747 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012748 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012749}
12750
Antoine Pitrou978b9d22011-10-07 12:35:48 +020012751static int
12752repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
12753{
12754 int r;
12755 assert(count > 0);
12756 assert(PyUnicode_Check(obj));
12757 if (count > 5) {
12758 PyObject *repeated = unicode_repeat((PyUnicodeObject *) obj, count);
12759 if (repeated == NULL)
12760 return -1;
12761 r = _PyAccu_Accumulate(acc, repeated);
12762 Py_DECREF(repeated);
12763 return r;
12764 }
12765 else {
12766 do {
12767 if (_PyAccu_Accumulate(acc, obj))
12768 return -1;
12769 } while (--count);
12770 return 0;
12771 }
12772}
12773
Alexander Belopolsky40018472011-02-26 01:02:56 +000012774PyObject *
12775PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012776{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012777 void *fmt;
12778 int fmtkind;
12779 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012780 int kind;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012781 int r;
12782 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012783 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012784 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012785 PyObject *temp = NULL;
12786 PyObject *second = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012787 PyUnicodeObject *uformat;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012788 _PyAccu acc;
12789 static PyObject *plus, *minus, *blank, *zero, *percent;
12790
12791 if (!plus && !(plus = get_latin1_char('+')))
12792 return NULL;
12793 if (!minus && !(minus = get_latin1_char('-')))
12794 return NULL;
12795 if (!blank && !(blank = get_latin1_char(' ')))
12796 return NULL;
12797 if (!zero && !(zero = get_latin1_char('0')))
12798 return NULL;
12799 if (!percent && !(percent = get_latin1_char('%')))
12800 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000012801
Guido van Rossumd57fd912000-03-10 22:53:23 +000012802 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012803 PyErr_BadInternalCall();
12804 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012805 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012806 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
12807 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012808 return NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012809 if (_PyAccu_Init(&acc))
12810 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012811 fmt = PyUnicode_DATA(uformat);
12812 fmtkind = PyUnicode_KIND(uformat);
12813 fmtcnt = PyUnicode_GET_LENGTH(uformat);
12814 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012815
Guido van Rossumd57fd912000-03-10 22:53:23 +000012816 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012817 arglen = PyTuple_Size(args);
12818 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012819 }
12820 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012821 arglen = -1;
12822 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012823 }
Christian Heimes90aa7642007-12-19 02:45:37 +000012824 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000012825 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000012826 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012827
12828 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012829 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012830 PyObject *nonfmt;
12831 Py_ssize_t nonfmtpos;
12832 nonfmtpos = fmtpos++;
12833 while (fmtcnt >= 0 &&
12834 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
12835 fmtpos++;
12836 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012837 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012838 nonfmt = PyUnicode_Substring((PyObject *) uformat, nonfmtpos, fmtpos);
12839 if (nonfmt == NULL)
12840 goto onError;
12841 r = _PyAccu_Accumulate(&acc, nonfmt);
12842 Py_DECREF(nonfmt);
12843 if (r)
12844 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012845 }
12846 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012847 /* Got a format specifier */
12848 int flags = 0;
12849 Py_ssize_t width = -1;
12850 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012851 Py_UCS4 c = '\0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012852 Py_UCS4 fill, sign;
Benjamin Peterson29060642009-01-31 22:14:21 +000012853 int isnumok;
12854 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020012855 void *pbuf = NULL;
12856 Py_ssize_t pindex, len;
12857 PyObject *signobj = NULL, *fillobj = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012858
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012859 fmtpos++;
12860 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
12861 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000012862 Py_ssize_t keylen;
12863 PyObject *key;
12864 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000012865
Benjamin Peterson29060642009-01-31 22:14:21 +000012866 if (dict == NULL) {
12867 PyErr_SetString(PyExc_TypeError,
12868 "format requires a mapping");
12869 goto onError;
12870 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012871 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012872 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012873 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012874 /* Skip over balanced parentheses */
12875 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012876 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012877 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012878 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012879 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012880 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012881 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012882 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012883 if (fmtcnt < 0 || pcount > 0) {
12884 PyErr_SetString(PyExc_ValueError,
12885 "incomplete format key");
12886 goto onError;
12887 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020012888 key = PyUnicode_Substring((PyObject*)uformat,
12889 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012890 if (key == NULL)
12891 goto onError;
12892 if (args_owned) {
12893 Py_DECREF(args);
12894 args_owned = 0;
12895 }
12896 args = PyObject_GetItem(dict, key);
12897 Py_DECREF(key);
12898 if (args == NULL) {
12899 goto onError;
12900 }
12901 args_owned = 1;
12902 arglen = -1;
12903 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012904 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012905 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012906 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012907 case '-': flags |= F_LJUST; continue;
12908 case '+': flags |= F_SIGN; continue;
12909 case ' ': flags |= F_BLANK; continue;
12910 case '#': flags |= F_ALT; continue;
12911 case '0': flags |= F_ZERO; continue;
12912 }
12913 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012914 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012915 if (c == '*') {
12916 v = getnextarg(args, arglen, &argidx);
12917 if (v == NULL)
12918 goto onError;
12919 if (!PyLong_Check(v)) {
12920 PyErr_SetString(PyExc_TypeError,
12921 "* wants int");
12922 goto onError;
12923 }
12924 width = PyLong_AsLong(v);
12925 if (width == -1 && PyErr_Occurred())
12926 goto onError;
12927 if (width < 0) {
12928 flags |= F_LJUST;
12929 width = -width;
12930 }
12931 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012932 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012933 }
12934 else if (c >= '0' && c <= '9') {
12935 width = c - '0';
12936 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012937 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012938 if (c < '0' || c > '9')
12939 break;
12940 if ((width*10) / 10 != width) {
12941 PyErr_SetString(PyExc_ValueError,
12942 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012943 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012944 }
12945 width = width*10 + (c - '0');
12946 }
12947 }
12948 if (c == '.') {
12949 prec = 0;
12950 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012951 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012952 if (c == '*') {
12953 v = getnextarg(args, arglen, &argidx);
12954 if (v == NULL)
12955 goto onError;
12956 if (!PyLong_Check(v)) {
12957 PyErr_SetString(PyExc_TypeError,
12958 "* wants int");
12959 goto onError;
12960 }
12961 prec = PyLong_AsLong(v);
12962 if (prec == -1 && PyErr_Occurred())
12963 goto onError;
12964 if (prec < 0)
12965 prec = 0;
12966 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012967 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012968 }
12969 else if (c >= '0' && c <= '9') {
12970 prec = c - '0';
12971 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012972 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012973 if (c < '0' || c > '9')
12974 break;
12975 if ((prec*10) / 10 != prec) {
12976 PyErr_SetString(PyExc_ValueError,
12977 "prec too big");
12978 goto onError;
12979 }
12980 prec = prec*10 + (c - '0');
12981 }
12982 }
12983 } /* prec */
12984 if (fmtcnt >= 0) {
12985 if (c == 'h' || c == 'l' || c == 'L') {
12986 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012987 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012988 }
12989 }
12990 if (fmtcnt < 0) {
12991 PyErr_SetString(PyExc_ValueError,
12992 "incomplete format");
12993 goto onError;
12994 }
12995 if (c != '%') {
12996 v = getnextarg(args, arglen, &argidx);
12997 if (v == NULL)
12998 goto onError;
12999 }
13000 sign = 0;
13001 fill = ' ';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013002 fillobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013003 switch (c) {
13004
13005 case '%':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013006 _PyAccu_Accumulate(&acc, percent);
13007 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013008
13009 case 's':
13010 case 'r':
13011 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000013012 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013013 temp = v;
13014 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013015 }
13016 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013017 if (c == 's')
13018 temp = PyObject_Str(v);
13019 else if (c == 'r')
13020 temp = PyObject_Repr(v);
13021 else
13022 temp = PyObject_ASCII(v);
13023 if (temp == NULL)
13024 goto onError;
13025 if (PyUnicode_Check(temp))
13026 /* nothing to do */;
13027 else {
13028 Py_DECREF(temp);
13029 PyErr_SetString(PyExc_TypeError,
13030 "%s argument has non-string str()");
13031 goto onError;
13032 }
13033 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013034 if (PyUnicode_READY(temp) == -1) {
13035 Py_CLEAR(temp);
13036 goto onError;
13037 }
13038 pbuf = PyUnicode_DATA(temp);
13039 kind = PyUnicode_KIND(temp);
13040 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013041 if (prec >= 0 && len > prec)
13042 len = prec;
13043 break;
13044
13045 case 'i':
13046 case 'd':
13047 case 'u':
13048 case 'o':
13049 case 'x':
13050 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000013051 isnumok = 0;
13052 if (PyNumber_Check(v)) {
13053 PyObject *iobj=NULL;
13054
13055 if (PyLong_Check(v)) {
13056 iobj = v;
13057 Py_INCREF(iobj);
13058 }
13059 else {
13060 iobj = PyNumber_Long(v);
13061 }
13062 if (iobj!=NULL) {
13063 if (PyLong_Check(iobj)) {
13064 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013065 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013066 Py_DECREF(iobj);
13067 if (!temp)
13068 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013069 if (PyUnicode_READY(temp) == -1) {
13070 Py_CLEAR(temp);
13071 goto onError;
13072 }
13073 pbuf = PyUnicode_DATA(temp);
13074 kind = PyUnicode_KIND(temp);
13075 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013076 sign = 1;
13077 }
13078 else {
13079 Py_DECREF(iobj);
13080 }
13081 }
13082 }
13083 if (!isnumok) {
13084 PyErr_Format(PyExc_TypeError,
13085 "%%%c format: a number is required, "
13086 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13087 goto onError;
13088 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013089 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013090 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013091 fillobj = zero;
13092 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013093 break;
13094
13095 case 'e':
13096 case 'E':
13097 case 'f':
13098 case 'F':
13099 case 'g':
13100 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013101 temp = formatfloat(v, flags, prec, c);
13102 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000013103 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013104 if (PyUnicode_READY(temp) == -1) {
13105 Py_CLEAR(temp);
13106 goto onError;
13107 }
13108 pbuf = PyUnicode_DATA(temp);
13109 kind = PyUnicode_KIND(temp);
13110 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013111 sign = 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013112 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013113 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013114 fillobj = zero;
13115 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013116 break;
13117
13118 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013119 {
13120 Py_UCS4 ch = formatchar(v);
13121 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013122 goto onError;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013123 temp = _PyUnicode_FromUCS4(&ch, 1);
13124 if (temp == NULL)
13125 goto onError;
13126 pbuf = PyUnicode_DATA(temp);
13127 kind = PyUnicode_KIND(temp);
13128 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013129 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013130 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013131
13132 default:
13133 PyErr_Format(PyExc_ValueError,
13134 "unsupported format character '%c' (0x%x) "
13135 "at index %zd",
13136 (31<=c && c<=126) ? (char)c : '?',
13137 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013138 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013139 goto onError;
13140 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013141 /* pbuf is initialized here. */
13142 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013143 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013144 if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
13145 signobj = minus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013146 len--;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013147 pindex++;
13148 }
13149 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
13150 signobj = plus;
13151 len--;
13152 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013153 }
13154 else if (flags & F_SIGN)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013155 signobj = plus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013156 else if (flags & F_BLANK)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013157 signobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013158 else
13159 sign = 0;
13160 }
13161 if (width < len)
13162 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013163 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013164 if (fill != ' ') {
13165 assert(signobj != NULL);
13166 if (_PyAccu_Accumulate(&acc, signobj))
13167 goto onError;
13168 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013169 if (width > len)
13170 width--;
13171 }
13172 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013173 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013174 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013175 if (fill != ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013176 second = get_latin1_char(
13177 PyUnicode_READ(kind, pbuf, pindex + 1));
13178 pindex += 2;
13179 if (second == NULL ||
13180 _PyAccu_Accumulate(&acc, zero) ||
13181 _PyAccu_Accumulate(&acc, second))
13182 goto onError;
13183 Py_CLEAR(second);
Benjamin Peterson29060642009-01-31 22:14:21 +000013184 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013185 width -= 2;
13186 if (width < 0)
13187 width = 0;
13188 len -= 2;
13189 }
13190 if (width > len && !(flags & F_LJUST)) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013191 assert(fillobj != NULL);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013192 if (repeat_accumulate(&acc, fillobj, width - len))
13193 goto onError;
13194 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013195 }
13196 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013197 if (sign) {
13198 assert(signobj != NULL);
13199 if (_PyAccu_Accumulate(&acc, signobj))
13200 goto onError;
13201 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013202 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013203 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13204 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013205 second = get_latin1_char(
13206 PyUnicode_READ(kind, pbuf, pindex + 1));
13207 pindex += 2;
13208 if (second == NULL ||
13209 _PyAccu_Accumulate(&acc, zero) ||
13210 _PyAccu_Accumulate(&acc, second))
13211 goto onError;
13212 Py_CLEAR(second);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013213 }
13214 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013215 /* Copy all characters, preserving len */
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013216 if (temp != NULL) {
13217 assert(pbuf == PyUnicode_DATA(temp));
13218 v = PyUnicode_Substring(temp, pindex, pindex + len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013219 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013220 else {
13221 const char *p = (const char *) pbuf;
13222 assert(pbuf != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013223 p += kind * pindex;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013224 v = PyUnicode_FromKindAndData(kind, p, len);
13225 }
13226 if (v == NULL)
13227 goto onError;
13228 r = _PyAccu_Accumulate(&acc, v);
13229 Py_DECREF(v);
13230 if (r)
13231 goto onError;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013232 if (width > len && repeat_accumulate(&acc, blank, width - len))
13233 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013234 if (dict && (argidx < arglen) && c != '%') {
13235 PyErr_SetString(PyExc_TypeError,
13236 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013237 goto onError;
13238 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013239 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013240 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013241 } /* until end */
13242 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013243 PyErr_SetString(PyExc_TypeError,
13244 "not all arguments converted during string formatting");
13245 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013246 }
13247
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013248 result = _PyAccu_Finish(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013249 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013250 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013251 }
13252 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013253 Py_XDECREF(temp);
13254 Py_XDECREF(second);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013255 return (PyObject *)result;
13256
Benjamin Peterson29060642009-01-31 22:14:21 +000013257 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013258 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013259 Py_XDECREF(temp);
13260 Py_XDECREF(second);
13261 _PyAccu_Destroy(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013262 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013263 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013264 }
13265 return NULL;
13266}
13267
Jeremy Hylton938ace62002-07-17 16:30:39 +000013268static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013269unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13270
Tim Peters6d6c1a32001-08-02 04:15:00 +000013271static PyObject *
13272unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13273{
Benjamin Peterson29060642009-01-31 22:14:21 +000013274 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013275 static char *kwlist[] = {"object", "encoding", "errors", 0};
13276 char *encoding = NULL;
13277 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013278
Benjamin Peterson14339b62009-01-31 16:36:08 +000013279 if (type != &PyUnicode_Type)
13280 return unicode_subtype_new(type, args, kwds);
13281 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013282 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013283 return NULL;
13284 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013285 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013286 if (encoding == NULL && errors == NULL)
13287 return PyObject_Str(x);
13288 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013289 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013290}
13291
Guido van Rossume023fe02001-08-30 03:12:59 +000013292static PyObject *
13293unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13294{
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013295 PyUnicodeObject *unicode, *self;
13296 Py_ssize_t length, char_size;
13297 int share_wstr, share_utf8;
13298 unsigned int kind;
13299 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013300
Benjamin Peterson14339b62009-01-31 16:36:08 +000013301 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013302
13303 unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
13304 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013305 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013306 assert(_PyUnicode_CHECK(unicode));
Victor Stinnere06e1452011-10-04 20:52:31 +020013307 if (PyUnicode_READY(unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013308 return NULL;
13309
13310 self = (PyUnicodeObject *) type->tp_alloc(type, 0);
13311 if (self == NULL) {
13312 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013313 return NULL;
13314 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013315 kind = PyUnicode_KIND(unicode);
13316 length = PyUnicode_GET_LENGTH(unicode);
13317
13318 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013319#ifdef Py_DEBUG
13320 _PyUnicode_HASH(self) = -1;
13321#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013322 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013323#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013324 _PyUnicode_STATE(self).interned = 0;
13325 _PyUnicode_STATE(self).kind = kind;
13326 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013327 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013328 _PyUnicode_STATE(self).ready = 1;
13329 _PyUnicode_WSTR(self) = NULL;
13330 _PyUnicode_UTF8_LENGTH(self) = 0;
13331 _PyUnicode_UTF8(self) = NULL;
13332 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013333 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013334
13335 share_utf8 = 0;
13336 share_wstr = 0;
13337 if (kind == PyUnicode_1BYTE_KIND) {
13338 char_size = 1;
13339 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13340 share_utf8 = 1;
13341 }
13342 else if (kind == PyUnicode_2BYTE_KIND) {
13343 char_size = 2;
13344 if (sizeof(wchar_t) == 2)
13345 share_wstr = 1;
13346 }
13347 else {
13348 assert(kind == PyUnicode_4BYTE_KIND);
13349 char_size = 4;
13350 if (sizeof(wchar_t) == 4)
13351 share_wstr = 1;
13352 }
13353
13354 /* Ensure we won't overflow the length. */
13355 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13356 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013357 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013358 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013359 data = PyObject_MALLOC((length + 1) * char_size);
13360 if (data == NULL) {
13361 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013362 goto onError;
13363 }
13364
Victor Stinnerc3c74152011-10-02 20:39:55 +020013365 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013366 if (share_utf8) {
13367 _PyUnicode_UTF8_LENGTH(self) = length;
13368 _PyUnicode_UTF8(self) = data;
13369 }
13370 if (share_wstr) {
13371 _PyUnicode_WSTR_LENGTH(self) = length;
13372 _PyUnicode_WSTR(self) = (wchar_t *)data;
13373 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013374
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013375 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013376 kind * (length + 1));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013377 Py_DECREF(unicode);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013378 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013379#ifdef Py_DEBUG
13380 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13381#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013382 return (PyObject *)self;
13383
13384onError:
13385 Py_DECREF(unicode);
13386 Py_DECREF(self);
13387 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013388}
13389
/* Docstring for the str type; installed in the tp_doc slot of
   PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
             "str(string[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013396
/* Forward declaration: the tp_iter slot of PyUnicode_Type refers to
   unicode_iter(), which is defined further down in this file. */
static PyObject *unicode_iter(PyObject *seq);

/* Type object for the built-in "str" type.  Slots left as 0 are not
   filled in by this initializer. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                          /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_basicsize */
    0,                              /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                              /* tp_print */
    0,                              /* tp_getattr */
    0,                              /* tp_setattr */
    0,                              /* tp_reserved */
    unicode_repr,                   /* tp_repr */
    &unicode_as_number,             /* tp_as_number */
    &unicode_as_sequence,           /* tp_as_sequence */
    &unicode_as_mapping,            /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash */
    0,                              /* tp_call */
    (reprfunc) unicode_str,         /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                              /* tp_setattro */
    0,                              /* tp_as_buffer */
    /* Subclassable, and flagged so that PyUnicode_Check() style tests can
       recognise subclasses through the type flags. */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
    unicode_doc,                    /* tp_doc */
    0,                              /* tp_traverse */
    0,                              /* tp_clear */
    PyUnicode_RichCompare,          /* tp_richcompare */
    0,                              /* tp_weaklistoffset */
    unicode_iter,                   /* tp_iter */
    0,                              /* tp_iternext */
    unicode_methods,                /* tp_methods */
    0,                              /* tp_members */
    0,                              /* tp_getset */
    &PyBaseObject_Type,             /* tp_base */
    0,                              /* tp_dict */
    0,                              /* tp_descr_get */
    0,                              /* tp_descr_set */
    0,                              /* tp_dictoffset */
    0,                              /* tp_init */
    0,                              /* tp_alloc */
    unicode_new,                    /* tp_new */
    PyObject_Del,                   /* tp_free */
};
13442
13443/* Initialize the Unicode implementation */
13444
Thomas Wouters78890102000-07-22 19:25:51 +000013445void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013446{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013447 int i;
13448
Thomas Wouters477c8d52006-05-27 19:21:47 +000013449 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013450 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013451 0x000A, /* LINE FEED */
13452 0x000D, /* CARRIAGE RETURN */
13453 0x001C, /* FILE SEPARATOR */
13454 0x001D, /* GROUP SEPARATOR */
13455 0x001E, /* RECORD SEPARATOR */
13456 0x0085, /* NEXT LINE */
13457 0x2028, /* LINE SEPARATOR */
13458 0x2029, /* PARAGRAPH SEPARATOR */
13459 };
13460
Fred Drakee4315f52000-05-09 19:53:39 +000013461 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013462 unicode_empty = PyUnicode_New(0, 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013463 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013464 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013465 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013466
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013467 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013468 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013469 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013470 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013471
13472 /* initialize the linebreak bloom filter */
13473 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013474 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013475 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013476
13477 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013478}
13479
13480/* Finalize the Unicode implementation */
13481
/* Release the free list of the Unicode implementation.

   No free list is kept here, so there is never anything to release and
   the reported count is always 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
13487
Guido van Rossumd57fd912000-03-10 22:53:23 +000013488void
Thomas Wouters78890102000-07-22 19:25:51 +000013489_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013490{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013491 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013492
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013493 Py_XDECREF(unicode_empty);
13494 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013495
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013496 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013497 if (unicode_latin1[i]) {
13498 Py_DECREF(unicode_latin1[i]);
13499 unicode_latin1[i] = NULL;
13500 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013501 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020013502 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000013503 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013504}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013505
Walter Dörwald16807132007-05-25 13:52:07 +000013506void
13507PyUnicode_InternInPlace(PyObject **p)
13508{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013509 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
13510 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020013511#ifdef Py_DEBUG
13512 assert(s != NULL);
13513 assert(_PyUnicode_CHECK(s));
13514#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000013515 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020013516 return;
13517#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000013518 /* If it's a subclass, we don't really know what putting
13519 it in the interned dict might do. */
13520 if (!PyUnicode_CheckExact(s))
13521 return;
13522 if (PyUnicode_CHECK_INTERNED(s))
13523 return;
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013524 if (_PyUnicode_READY_REPLACE(p)) {
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013525 assert(0 && "_PyUnicode_READY_REPLACE fail in PyUnicode_InternInPlace");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013526 return;
13527 }
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013528 s = (PyUnicodeObject *)(*p);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013529 if (interned == NULL) {
13530 interned = PyDict_New();
13531 if (interned == NULL) {
13532 PyErr_Clear(); /* Don't leave an exception */
13533 return;
13534 }
13535 }
13536 /* It might be that the GetItem call fails even
13537 though the key is present in the dictionary,
13538 namely when this happens during a stack overflow. */
13539 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000013540 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013541 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000013542
Benjamin Peterson29060642009-01-31 22:14:21 +000013543 if (t) {
13544 Py_INCREF(t);
13545 Py_DECREF(*p);
13546 *p = t;
13547 return;
13548 }
Walter Dörwald16807132007-05-25 13:52:07 +000013549
Benjamin Peterson14339b62009-01-31 16:36:08 +000013550 PyThreadState_GET()->recursion_critical = 1;
13551 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
13552 PyErr_Clear();
13553 PyThreadState_GET()->recursion_critical = 0;
13554 return;
13555 }
13556 PyThreadState_GET()->recursion_critical = 0;
13557 /* The two references in interned are not counted by refcnt.
13558 The deallocator will take care of this */
13559 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013560 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000013561}
13562
13563void
13564PyUnicode_InternImmortal(PyObject **p)
13565{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013566 PyUnicodeObject *u = (PyUnicodeObject *)*p;
13567
Benjamin Peterson14339b62009-01-31 16:36:08 +000013568 PyUnicode_InternInPlace(p);
13569 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013570 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013571 Py_INCREF(*p);
13572 }
Walter Dörwald16807132007-05-25 13:52:07 +000013573}
13574
13575PyObject *
13576PyUnicode_InternFromString(const char *cp)
13577{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013578 PyObject *s = PyUnicode_FromString(cp);
13579 if (s == NULL)
13580 return NULL;
13581 PyUnicode_InternInPlace(&s);
13582 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000013583}
13584
Alexander Belopolsky40018472011-02-26 01:02:56 +000013585void
13586_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000013587{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013588 PyObject *keys;
13589 PyUnicodeObject *s;
13590 Py_ssize_t i, n;
13591 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000013592
Benjamin Peterson14339b62009-01-31 16:36:08 +000013593 if (interned == NULL || !PyDict_Check(interned))
13594 return;
13595 keys = PyDict_Keys(interned);
13596 if (keys == NULL || !PyList_Check(keys)) {
13597 PyErr_Clear();
13598 return;
13599 }
Walter Dörwald16807132007-05-25 13:52:07 +000013600
Benjamin Peterson14339b62009-01-31 16:36:08 +000013601 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13602 detector, interned unicode strings are not forcibly deallocated;
13603 rather, we give them their stolen references back, and then clear
13604 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000013605
Benjamin Peterson14339b62009-01-31 16:36:08 +000013606 n = PyList_GET_SIZE(keys);
13607 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000013608 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013609 for (i = 0; i < n; i++) {
13610 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013611 if (PyUnicode_READY(s) == -1) {
13612 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013613 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013614 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013615 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013616 case SSTATE_NOT_INTERNED:
13617 /* XXX Shouldn't happen */
13618 break;
13619 case SSTATE_INTERNED_IMMORTAL:
13620 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013621 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013622 break;
13623 case SSTATE_INTERNED_MORTAL:
13624 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013625 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013626 break;
13627 default:
13628 Py_FatalError("Inconsistent interned string state.");
13629 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013630 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013631 }
13632 fprintf(stderr, "total size of all interned strings: "
13633 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13634 "mortal/immortal\n", mortal_size, immortal_size);
13635 Py_DECREF(keys);
13636 PyDict_Clear(interned);
13637 Py_DECREF(interned);
13638 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000013639}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013640
13641
13642/********************* Unicode Iterator **************************/
13643
/* State of a str iterator object. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;        /* index of the next character to return */
    PyUnicodeObject *it_seq;    /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
13649
13650static void
13651unicodeiter_dealloc(unicodeiterobject *it)
13652{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013653 _PyObject_GC_UNTRACK(it);
13654 Py_XDECREF(it->it_seq);
13655 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013656}
13657
13658static int
13659unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
13660{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013661 Py_VISIT(it->it_seq);
13662 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013663}
13664
13665static PyObject *
13666unicodeiter_next(unicodeiterobject *it)
13667{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013668 PyUnicodeObject *seq;
13669 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013670
Benjamin Peterson14339b62009-01-31 16:36:08 +000013671 assert(it != NULL);
13672 seq = it->it_seq;
13673 if (seq == NULL)
13674 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013675 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013676
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013677 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
13678 int kind = PyUnicode_KIND(seq);
13679 void *data = PyUnicode_DATA(seq);
13680 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
13681 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013682 if (item != NULL)
13683 ++it->it_index;
13684 return item;
13685 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013686
Benjamin Peterson14339b62009-01-31 16:36:08 +000013687 Py_DECREF(seq);
13688 it->it_seq = NULL;
13689 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013690}
13691
13692static PyObject *
13693unicodeiter_len(unicodeiterobject *it)
13694{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013695 Py_ssize_t len = 0;
13696 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013697 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013698 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013699}
13700
/* Docstring of the __length_hint__ method below. */
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");

/* Method table of the str iterator type; installed in the tp_methods slot
   of PyUnicodeIter_Type. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {NULL,      NULL}       /* sentinel */
};
13708
/* Type object of the iterator returned by unicode_iter().  Instances take
   part in garbage collection (Py_TPFLAGS_HAVE_GC with a tp_traverse
   slot). */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",                     /* tp_name */
    sizeof(unicodeiterobject),          /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_reserved */
    0,                                  /* tp_repr */
    0,                                  /* tp_as_number */
    0,                                  /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    0,                                  /* tp_hash */
    0,                                  /* tp_call */
    0,                                  /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    0,                                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    PyObject_SelfIter,                  /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,                /* tp_methods */
    0,                                  /* tp_members */
};
13741
13742static PyObject *
13743unicode_iter(PyObject *seq)
13744{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013745 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013746
Benjamin Peterson14339b62009-01-31 16:36:08 +000013747 if (!PyUnicode_Check(seq)) {
13748 PyErr_BadInternalCall();
13749 return NULL;
13750 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013751 if (PyUnicode_READY(seq) == -1)
13752 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013753 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
13754 if (it == NULL)
13755 return NULL;
13756 it->it_index = 0;
13757 Py_INCREF(seq);
13758 it->it_seq = (PyUnicodeObject *)seq;
13759 _PyObject_GC_TRACK(it);
13760 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013761}
13762
/* Include "uniops.h" twice, each time with a different definition of the
   UNIOP() name-pasting macro and the UNIOP_t element type: first for
   Py_UNICODE (names prefixed Py_UNICODE_), then for Py_UCS4 (names
   prefixed Py_UCS4_).  The macros are undefined again after each
   inclusion.
   NOTE(review): presumably uniops.h is a template that defines one set of
   helpers per inclusion - confirm against the header. */
#define UNIOP(x) Py_UNICODE_##x
#define UNIOP_t Py_UNICODE
#include "uniops.h"
#undef UNIOP
#undef UNIOP_t
#define UNIOP(x) Py_UCS4_##x
#define UNIOP_t Py_UCS4
#include "uniops.h"
#undef UNIOP
#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000013773
Victor Stinner71133ff2010-09-01 23:43:53 +000013774Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000013775PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000013776{
13777 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
Victor Stinner577db2c2011-10-11 22:12:48 +020013778 Py_UNICODE *u, *copy;
Victor Stinner71133ff2010-09-01 23:43:53 +000013779 Py_ssize_t size;
13780
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013781 if (!PyUnicode_Check(unicode)) {
13782 PyErr_BadArgument();
13783 return NULL;
13784 }
Victor Stinner577db2c2011-10-11 22:12:48 +020013785 u = PyUnicode_AsUnicode(object);
13786 if (u == NULL)
13787 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000013788 /* Ensure we won't overflow the size. */
13789 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
13790 PyErr_NoMemory();
13791 return NULL;
13792 }
13793 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
13794 size *= sizeof(Py_UNICODE);
13795 copy = PyMem_Malloc(size);
13796 if (copy == NULL) {
13797 PyErr_NoMemory();
13798 return NULL;
13799 }
Victor Stinner577db2c2011-10-11 22:12:48 +020013800 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000013801 return copy;
13802}
Martin v. Löwis5b222132007-06-10 09:51:05 +000013803
Georg Brandl66c221e2010-10-14 07:04:07 +000013804/* A _string module, to export formatter_parser and formatter_field_name_split
13805 to the string.Formatter class implemented in Python. */
13806
/* Functions exported by the _string module.  Both take a single argument
   (METH_O) and are implemented elsewhere in this file. */
static PyMethodDef _string_methods[] = {
    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
     METH_O, PyDoc_STR("split the argument as a field name")},
    {"formatter_parser", (PyCFunction) formatter_parser,
     METH_O, PyDoc_STR("parse the argument as a format string")},
    {NULL, NULL}    /* sentinel */
};
13814
/* Module definition of _string, passed to PyModule_Create() by
   PyInit__string(). */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,
    "_string",                          /* m_name */
    PyDoc_STR("string helper module"),  /* m_doc */
    0,                                  /* m_size */
    _string_methods,                    /* m_methods */
    NULL,                               /* m_reload */
    NULL,                               /* m_traverse */
    NULL,                               /* m_clear */
    NULL                                /* m_free */
};
13826
13827PyMODINIT_FUNC
13828PyInit__string(void)
13829{
13830 return PyModule_Create(&_string_module);
13831}
13832
13833
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013834#ifdef __cplusplus
13835}
13836#endif