/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
Victor Stinnerce5faf62011-10-05 00:42:43 +020049#ifdef Py_DEBUG
50# define DONT_MAKE_RESULT_READY
51#endif
52
/* Limit for the Unicode object free list */

#define PyUnicode_MAXFREELIST       1024

/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size less than this
   limit. This reduces malloc() overhead for small Unicode objects.

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Endianness switches; defaults to little endian.
   Exactly one of BYTEORDER_IS_BIG_ENDIAN / BYTEORDER_IS_LITTLE_ENDIAN is
   defined, selected by WORDS_BIGENDIAN. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
83
/* --- Globals ------------------------------------------------------------

   The globals are initialized by the _PyUnicode_Init() API and should
   not be used before calling that API.

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
/* Validity check used inside the assertions of the accessor macros: the
   consistency checker in debug builds, a plain type check otherwise. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

/* Unchecked lvalue access to the utf8 field (PyCompactUnicodeObject). */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* Checked UTF-8 pointer: for compact ASCII strings this is the memory right
   after the PyASCIIObject header, otherwise the utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* Unchecked lvalue access to the utf8_length field. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* Checked UTF-8 length: the character length for compact ASCII strings,
   otherwise the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

/* Unchecked lvalue access to individual header fields. */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)

/* Same fields as above, but guarded by an assertion on the object. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* Unchecked lvalue access to the data pointer of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200136
/* Local override of PyUnicode_READY that asserts on the object first.
   Evaluates to 0 when the string is already ready, otherwise to the result
   of _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                        \
    (assert(_PyUnicode_CHECK(op)),                 \
     (PyUnicode_IS_READY(op) ?                     \
      0 :                                          \
      _PyUnicode_Ready((PyObject *)(op))))

/* Same as PyUnicode_READY, but takes the address of the object pointer and
   delegates to _PyUnicode_ReadyReplace() when the string is not ready.
   The argument is parenthesised before being dereferenced so that an
   expression such as `array + 1` selects the intended element. */
#define _PyUnicode_READY_REPLACE(p_obj)            \
    (assert(_PyUnicode_CHECK(*(p_obj))),           \
     (PyUnicode_IS_READY(*(p_obj)) ?               \
      0 : _PyUnicode_ReadyReplace((PyObject **)(p_obj))))
148
/* true if the utf8 pointer aliases the character data; must not be used on
   compact ASCII strings (asserted) */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* true if the wstr pointer aliases the character data.
   Uses the macro argument throughout, so it no longer depends on the caller
   having a variable that happens to be named `unicode`. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
156
/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): not compact ASCII, utf8 is set, and utf8
   does not alias the character data */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data): wstr is set and either the string is not
   ready yet or wstr does not alias the character data */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
172
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   The main loop is unrolled four characters at a time; the trailing loop
   copies the remaining 0-3 characters.  `to` is parenthesised before the
   cast so that an expression argument (e.g. `base + offset`) is converted
   as a whole, and every local uses the `_` prefix. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + (_n & ~ (Py_ssize_t) 3);            \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200196
/* The Unicode string has been modified: reset the cached hash to -1 */
#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
199
Walter Dörwald16807132007-05-25 13:52:07 +0000200/* This dictionary holds all interned unicode strings. Note that references
201 to strings in this dictionary are *not* counted in the string's ob_refcnt.
202 When the interned string reaches a refcnt of 0 the string deallocation
203 function will delete the reference from this dictionary.
204
205 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000206 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000207*/
208static PyObject *interned;
209
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000210/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200211static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000212
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200213/* List of static strings. */
214static _Py_Identifier *static_strings;
215
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000216/* Single character Unicode strings in the Latin-1 range are being
217 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200218static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000219
/* Fast detection of the most frequent whitespace characters.
   128-entry lookup table indexed by ASCII code: 1 for whitespace, else 0. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x0009: * CHARACTER TABULATION */
/*     case 0x000A: * LINE FEED */
/*     case 0x000B: * LINE TABULATION */
/*     case 0x000C: * FORM FEED */
/*     case 0x000D: * CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x001C: * FILE SEPARATOR */
/*     case 0x001D: * GROUP SEPARATOR */
/*     case 0x001E: * RECORD SEPARATOR */
/*     case 0x001F: * UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
/*     case 0x0020: * SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
250
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200251/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200252static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200253static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200254static void copy_characters(
255 PyObject *to, Py_ssize_t to_start,
256 PyObject *from, Py_ssize_t from_start,
257 Py_ssize_t how_many);
Victor Stinnerc729b8e2011-10-06 02:36:59 +0200258#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200259static int unicode_is_singleton(PyObject *unicode);
Victor Stinnerc729b8e2011-10-06 02:36:59 +0200260#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200261
Alexander Belopolsky40018472011-02-26 01:02:56 +0000262static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200263unicode_fromascii(const unsigned char *s, Py_ssize_t size);
264static PyObject *
265_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
266static PyObject *
267_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
268static PyObject *
269_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
270
271static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000272unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000273 PyObject **errorHandler,const char *encoding, const char *reason,
274 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
275 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
276
Alexander Belopolsky40018472011-02-26 01:02:56 +0000277static void
278raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300279 const char *encoding,
280 const Py_UNICODE *unicode, Py_ssize_t size,
281 Py_ssize_t startpos, Py_ssize_t endpos,
282 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000283
/* Fast detection of line break characters in the ASCII range.
   128-entry lookup table indexed by ASCII code: 1 for a line break, else 0.
   The table is only read (see BLOOM_LINEBREAK), hence const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
311
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300312/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
313 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000314Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000315PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000316{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000317#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000318 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000319#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000320 /* This is actually an illegal character, so it should
321 not be passed to unichr. */
322 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000323#endif
324}
325
#ifdef Py_DEBUG
/* Debug-only structural check of a Unicode object.

   op            -- the object to check; asserted to be a Unicode object.
   check_content -- if non-zero, additionally scan the characters to verify
                    that the narrowest possible kind is used, and require
                    the hash of non-singleton strings to be unset (-1).

   Every violation trips an assert(); the function itself always returns 1
   so that it can be wrapped in assert() by callers. */
/* FIXME: use PyObject* type for op */
int
_PyUnicode_CheckConsistency(void *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII: always 1-byte kind and ready */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact non-ASCII: characters follow the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        } else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready yet: only the wstr representation exists */
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
            }
            else {
                /* ready, non-compact string */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    /* ASCII data doubles as the UTF-8 representation */
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr aliases the data exactly when the kind matches the
               width of wchar_t */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
                )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            } else
                assert(ascii->wstr != data);
        }

        /* an absent cache must have a zero length */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data = PyUnicode_DATA(ascii);
        for (i=0; i < ascii->length; i++)
        {
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0)
                assert(maxchar >= 128);
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND)
            assert(maxchar >= 0x100);
        else
            assert(maxchar >= 0x10000);
    }
    if (check_content && !unicode_is_singleton((PyObject*)ascii))
        assert(ascii->hash == -1);
    return 1;
}
#endif
431
#ifdef HAVE_MBCS
/* Windows version information, only present when HAVE_MBCS is defined.
   NOTE(review): where this gets filled in is not visible in this part of
   the file -- confirm against the initialisation code. */
static OSVERSIONINFOEX winver;
#endif
435
Thomas Wouters477c8d52006-05-27 19:21:47 +0000436/* --- Bloom Filters ----------------------------------------------------- */
437
438/* stuff to implement simple "bloom filters" for Unicode characters.
439 to keep things simple, we use a single bitmask, using the least 5
440 bits from each unicode characters as the bit index. */
441
442/* the linebreak mask is set up by Unicode_Init below */
443
Antoine Pitrouf068f942010-01-13 14:19:12 +0000444#if LONG_BIT >= 128
445#define BLOOM_WIDTH 128
446#elif LONG_BIT >= 64
447#define BLOOM_WIDTH 64
448#elif LONG_BIT >= 32
449#define BLOOM_WIDTH 32
450#else
451#error "LONG_BIT is smaller than 32"
452#endif
453
Thomas Wouters477c8d52006-05-27 19:21:47 +0000454#define BLOOM_MASK unsigned long
455
456static BLOOM_MASK bloom_linebreak;
457
Antoine Pitrouf068f942010-01-13 14:19:12 +0000458#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
459#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000460
Benjamin Peterson29060642009-01-31 22:14:21 +0000461#define BLOOM_LINEBREAK(ch) \
462 ((ch) < 128U ? ascii_linebreak[(ch)] : \
463 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000464
Alexander Belopolsky40018472011-02-26 01:02:56 +0000465Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200466make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000467{
468 /* calculate simple bloom-style bitmask for a given unicode string */
469
Antoine Pitrouf068f942010-01-13 14:19:12 +0000470 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000471 Py_ssize_t i;
472
473 mask = 0;
474 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200475 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000476
477 return mask;
478}
479
/* true if chr occurs in str: the bloom mask is tested first and the exact
   search with PyUnicode_FindChar() only runs when the mask bit is set.
   Note that str and chr are evaluated more than once. */
#define BLOOM_MEMBER(mask, chr, str) \
    (BLOOM(mask, chr) \
     && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000483
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200484/* Compilation of templated routines */
485
486#include "stringlib/asciilib.h"
487#include "stringlib/fastsearch.h"
488#include "stringlib/partition.h"
489#include "stringlib/split.h"
490#include "stringlib/count.h"
491#include "stringlib/find.h"
492#include "stringlib/find_max_char.h"
493#include "stringlib/localeutil.h"
494#include "stringlib/undef.h"
495
496#include "stringlib/ucs1lib.h"
497#include "stringlib/fastsearch.h"
498#include "stringlib/partition.h"
499#include "stringlib/split.h"
500#include "stringlib/count.h"
501#include "stringlib/find.h"
502#include "stringlib/find_max_char.h"
503#include "stringlib/localeutil.h"
504#include "stringlib/undef.h"
505
506#include "stringlib/ucs2lib.h"
507#include "stringlib/fastsearch.h"
508#include "stringlib/partition.h"
509#include "stringlib/split.h"
510#include "stringlib/count.h"
511#include "stringlib/find.h"
512#include "stringlib/find_max_char.h"
513#include "stringlib/localeutil.h"
514#include "stringlib/undef.h"
515
516#include "stringlib/ucs4lib.h"
517#include "stringlib/fastsearch.h"
518#include "stringlib/partition.h"
519#include "stringlib/split.h"
520#include "stringlib/count.h"
521#include "stringlib/find.h"
522#include "stringlib/find_max_char.h"
523#include "stringlib/localeutil.h"
524#include "stringlib/undef.h"
525
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200526#include "stringlib/unicodedefs.h"
527#include "stringlib/fastsearch.h"
528#include "stringlib/count.h"
529#include "stringlib/find.h"
530
Guido van Rossumd57fd912000-03-10 22:53:23 +0000531/* --- Unicode Object ----------------------------------------------------- */
532
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200533static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200534fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200535
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200536Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
537 Py_ssize_t size, Py_UCS4 ch,
538 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200539{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200540 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
541
542 switch (kind) {
543 case PyUnicode_1BYTE_KIND:
544 {
545 Py_UCS1 ch1 = (Py_UCS1) ch;
546 if (ch1 == ch)
547 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
548 else
549 return -1;
550 }
551 case PyUnicode_2BYTE_KIND:
552 {
553 Py_UCS2 ch2 = (Py_UCS2) ch;
554 if (ch2 == ch)
555 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
556 else
557 return -1;
558 }
559 case PyUnicode_4BYTE_KIND:
560 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
561 default:
562 assert(0);
563 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200564 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200565}
566
Victor Stinnerfe226c02011-10-03 03:52:20 +0200567static PyObject*
568resize_compact(PyObject *unicode, Py_ssize_t length)
569{
570 Py_ssize_t char_size;
571 Py_ssize_t struct_size;
572 Py_ssize_t new_size;
573 int share_wstr;
574
575 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200576 char_size = PyUnicode_KIND(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200577 if (PyUnicode_IS_COMPACT_ASCII(unicode))
578 struct_size = sizeof(PyASCIIObject);
579 else
580 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200581 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200582
583 _Py_DEC_REFTOTAL;
584 _Py_ForgetReference(unicode);
585
586 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
587 PyErr_NoMemory();
588 return NULL;
589 }
590 new_size = (struct_size + (length + 1) * char_size);
591
592 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
593 if (unicode == NULL) {
594 PyObject_Del(unicode);
595 PyErr_NoMemory();
596 return NULL;
597 }
598 _Py_NewReference(unicode);
599 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200600 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200601 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200602 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
603 _PyUnicode_WSTR_LENGTH(unicode) = length;
604 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200605 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
606 length, 0);
607 return unicode;
608}
609
Alexander Belopolsky40018472011-02-26 01:02:56 +0000610static int
Victor Stinner95663112011-10-04 01:03:50 +0200611resize_inplace(PyUnicodeObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000612{
Victor Stinner95663112011-10-04 01:03:50 +0200613 wchar_t *wstr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200614 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200615 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000616
Victor Stinner95663112011-10-04 01:03:50 +0200617 _PyUnicode_DIRTY(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200618
619 if (PyUnicode_IS_READY(unicode)) {
620 Py_ssize_t char_size;
621 Py_ssize_t new_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200622 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200623 void *data;
624
625 data = _PyUnicode_DATA_ANY(unicode);
626 assert(data != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200627 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200628 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
629 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinner95663112011-10-04 01:03:50 +0200630 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
631 {
632 PyObject_DEL(_PyUnicode_UTF8(unicode));
633 _PyUnicode_UTF8(unicode) = NULL;
634 _PyUnicode_UTF8_LENGTH(unicode) = 0;
635 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200636
637 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
638 PyErr_NoMemory();
639 return -1;
640 }
641 new_size = (length + 1) * char_size;
642
643 data = (PyObject *)PyObject_REALLOC(data, new_size);
644 if (data == NULL) {
645 PyErr_NoMemory();
646 return -1;
647 }
648 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200649 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200650 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200651 _PyUnicode_WSTR_LENGTH(unicode) = length;
652 }
653 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200654 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200655 _PyUnicode_UTF8_LENGTH(unicode) = length;
656 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200657 _PyUnicode_LENGTH(unicode) = length;
658 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200659 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200660 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200661 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200662 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200663 }
Victor Stinner95663112011-10-04 01:03:50 +0200664 assert(_PyUnicode_WSTR(unicode) != NULL);
665
666 /* check for integer overflow */
667 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
668 PyErr_NoMemory();
669 return -1;
670 }
671 wstr = _PyUnicode_WSTR(unicode);
672 wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
673 if (!wstr) {
674 PyErr_NoMemory();
675 return -1;
676 }
677 _PyUnicode_WSTR(unicode) = wstr;
678 _PyUnicode_WSTR(unicode)[length] = 0;
679 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200680 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000681 return 0;
682}
683
Victor Stinnerfe226c02011-10-03 03:52:20 +0200684static PyObject*
685resize_copy(PyObject *unicode, Py_ssize_t length)
686{
687 Py_ssize_t copy_length;
688 if (PyUnicode_IS_COMPACT(unicode)) {
689 PyObject *copy;
690 assert(PyUnicode_IS_READY(unicode));
691
692 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
693 if (copy == NULL)
694 return NULL;
695
696 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200697 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200698 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200699 }
700 else {
Victor Stinner2fd82272011-10-03 04:06:05 +0200701 PyUnicodeObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200702 assert(_PyUnicode_WSTR(unicode) != NULL);
703 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner2fd82272011-10-03 04:06:05 +0200704 w = _PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200705 if (w == NULL)
706 return NULL;
707 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
708 copy_length = Py_MIN(copy_length, length);
709 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
710 copy_length);
711 return (PyObject*)w;
712 }
713}
714
/* We allocate one more Py_UNICODE element than requested to make sure the
   string is U+0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
723
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200724#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200725static int unicode_old_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200726#endif
727
Alexander Belopolsky40018472011-02-26 01:02:56 +0000728static PyUnicodeObject *
729_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000730{
731 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200732 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000733
Thomas Wouters477c8d52006-05-27 19:21:47 +0000734 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000735 if (length == 0 && unicode_empty != NULL) {
736 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200737 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000738 }
739
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000740 /* Ensure we won't overflow the size. */
741 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
742 return (PyUnicodeObject *)PyErr_NoMemory();
743 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200744 if (length < 0) {
745 PyErr_SetString(PyExc_SystemError,
746 "Negative size passed to _PyUnicode_New");
747 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000748 }
749
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200750#ifdef Py_DEBUG
751 ++unicode_old_new_calls;
752#endif
753
754 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
755 if (unicode == NULL)
756 return NULL;
757 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
758 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
759 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000760 PyErr_NoMemory();
761 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000762 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200763
Jeremy Hyltond8082792003-09-16 19:41:39 +0000764 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000765 * the caller fails before initializing str -- unicode_resize()
766 * reads str[0], and the Keep-Alive optimization can keep memory
767 * allocated for str alive across a call to unicode_dealloc(unicode).
768 * We don't want unicode_resize to read uninitialized memory in
769 * that case.
770 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200771 _PyUnicode_WSTR(unicode)[0] = 0;
772 _PyUnicode_WSTR(unicode)[length] = 0;
773 _PyUnicode_WSTR_LENGTH(unicode) = length;
774 _PyUnicode_HASH(unicode) = -1;
775 _PyUnicode_STATE(unicode).interned = 0;
776 _PyUnicode_STATE(unicode).kind = 0;
777 _PyUnicode_STATE(unicode).compact = 0;
778 _PyUnicode_STATE(unicode).ready = 0;
779 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200780 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200781 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200782 _PyUnicode_UTF8(unicode) = NULL;
783 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner67072932011-10-18 22:10:14 +0200784 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000785 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000786
Benjamin Peterson29060642009-01-31 22:14:21 +0000787 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000788 /* XXX UNREF/NEWREF interface should be more symmetrical */
789 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000790 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000791 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000792 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000793}
794
Victor Stinnerf42dc442011-10-02 23:33:16 +0200795static const char*
796unicode_kind_name(PyObject *unicode)
797{
Victor Stinner42dfd712011-10-03 14:41:45 +0200798 /* don't check consistency: unicode_kind_name() is called from
799 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200800 if (!PyUnicode_IS_COMPACT(unicode))
801 {
802 if (!PyUnicode_IS_READY(unicode))
803 return "wstr";
804 switch(PyUnicode_KIND(unicode))
805 {
806 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200807 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200808 return "legacy ascii";
809 else
810 return "legacy latin1";
811 case PyUnicode_2BYTE_KIND:
812 return "legacy UCS2";
813 case PyUnicode_4BYTE_KIND:
814 return "legacy UCS4";
815 default:
816 return "<legacy invalid kind>";
817 }
818 }
819 assert(PyUnicode_IS_READY(unicode));
820 switch(PyUnicode_KIND(unicode))
821 {
822 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200823 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200824 return "ascii";
825 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200826 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200827 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200828 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200829 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200830 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200831 default:
832 return "<invalid compact kind>";
833 }
834}
835
#ifdef Py_DEBUG
/* Debug-build counter, incremented by PyUnicode_New(). */
static int unicode_new_new_calls = 0;

/* Functions wrapping macros for use in debugger */

/* Function form of the PyUnicode_UTF8() macro. */
char *_PyUnicode_utf8(void *unicode){
    return PyUnicode_UTF8(unicode);
}

/* Function form of the _PyUnicode_COMPACT_DATA() macro. */
void *_PyUnicode_compact_data(void *unicode) {
    return _PyUnicode_COMPACT_DATA(unicode);
}
/* Debugger helper: print the object address, its compact / compact-ascii
   flags and the candidate data addresses (just past a PyASCIIObject header,
   just past a PyCompactUnicodeObject header, and the _PyUnicode_COMPACT_DATA()
   result) to stdout, then return PyUnicode_DATA(unicode). */
void *_PyUnicode_data(void *unicode){
    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200856
857void
858_PyUnicode_Dump(PyObject *op)
859{
860 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200861 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
862 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
863 void *data;
864 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
865 if (ascii->state.compact)
866 data = (compact + 1);
867 else
868 data = unicode->data.any;
869 if (ascii->wstr == data)
870 printf("shared ");
871 printf("wstr=%p", ascii->wstr);
Victor Stinnera3b334d2011-10-03 13:53:37 +0200872 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200873 printf(" (%zu), ", compact->wstr_length);
874 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
875 printf("shared ");
876 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200877 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200878 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200879}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200880#endif
881
882PyObject *
883PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
884{
885 PyObject *obj;
886 PyCompactUnicodeObject *unicode;
887 void *data;
888 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200889 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200890 Py_ssize_t char_size;
891 Py_ssize_t struct_size;
892
893 /* Optimization for empty strings */
894 if (size == 0 && unicode_empty != NULL) {
895 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200896 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200897 }
898
899#ifdef Py_DEBUG
900 ++unicode_new_new_calls;
901#endif
902
Victor Stinner9e9d6892011-10-04 01:02:02 +0200903 is_ascii = 0;
904 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200905 struct_size = sizeof(PyCompactUnicodeObject);
906 if (maxchar < 128) {
907 kind_state = PyUnicode_1BYTE_KIND;
908 char_size = 1;
909 is_ascii = 1;
910 struct_size = sizeof(PyASCIIObject);
911 }
912 else if (maxchar < 256) {
913 kind_state = PyUnicode_1BYTE_KIND;
914 char_size = 1;
915 }
916 else if (maxchar < 65536) {
917 kind_state = PyUnicode_2BYTE_KIND;
918 char_size = 2;
919 if (sizeof(wchar_t) == 2)
920 is_sharing = 1;
921 }
922 else {
923 kind_state = PyUnicode_4BYTE_KIND;
924 char_size = 4;
925 if (sizeof(wchar_t) == 4)
926 is_sharing = 1;
927 }
928
929 /* Ensure we won't overflow the size. */
930 if (size < 0) {
931 PyErr_SetString(PyExc_SystemError,
932 "Negative size passed to PyUnicode_New");
933 return NULL;
934 }
935 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
936 return PyErr_NoMemory();
937
938 /* Duplicated allocation code from _PyObject_New() instead of a call to
939 * PyObject_New() so we are able to allocate space for the object and
940 * it's data buffer.
941 */
942 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
943 if (obj == NULL)
944 return PyErr_NoMemory();
945 obj = PyObject_INIT(obj, &PyUnicode_Type);
946 if (obj == NULL)
947 return NULL;
948
949 unicode = (PyCompactUnicodeObject *)obj;
950 if (is_ascii)
951 data = ((PyASCIIObject*)obj) + 1;
952 else
953 data = unicode + 1;
954 _PyUnicode_LENGTH(unicode) = size;
955 _PyUnicode_HASH(unicode) = -1;
956 _PyUnicode_STATE(unicode).interned = 0;
957 _PyUnicode_STATE(unicode).kind = kind_state;
958 _PyUnicode_STATE(unicode).compact = 1;
959 _PyUnicode_STATE(unicode).ready = 1;
960 _PyUnicode_STATE(unicode).ascii = is_ascii;
961 if (is_ascii) {
962 ((char*)data)[size] = 0;
963 _PyUnicode_WSTR(unicode) = NULL;
964 }
965 else if (kind_state == PyUnicode_1BYTE_KIND) {
966 ((char*)data)[size] = 0;
967 _PyUnicode_WSTR(unicode) = NULL;
968 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200969 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200970 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200971 }
972 else {
973 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200974 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200975 if (kind_state == PyUnicode_2BYTE_KIND)
976 ((Py_UCS2*)data)[size] = 0;
977 else /* kind_state == PyUnicode_4BYTE_KIND */
978 ((Py_UCS4*)data)[size] = 0;
979 if (is_sharing) {
980 _PyUnicode_WSTR_LENGTH(unicode) = size;
981 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
982 }
983 else {
984 _PyUnicode_WSTR_LENGTH(unicode) = 0;
985 _PyUnicode_WSTR(unicode) = NULL;
986 }
987 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200988 assert(_PyUnicode_CheckConsistency(unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200989 return obj;
990}
991
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bit wchar_t representation to UCS4; this
   decodes surrogate pairs.  The other conversions are implemented as macros
   for efficiency.

   [begin, end) is the wchar_t input; the output is written to the 4-byte
   data buffer of `unicode`, which must already have kind
   PyUnicode_4BYTE_KIND and a length equal to the number of decoded code
   points.  A high surrogate not followed by a low surrogate is copied
   through unchanged.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (src[0] >= 0xD800 && src[0] <= 0xDBFF
            && (src+1) < end && src[1] >= 0xDC00 && src[1] <= 0xDFFF)
        {
            /* Valid surrogate pair: combine into one code point. */
            *dst++ = (((src[0] & 0x3FF)<<10) | (src[1] & 0x3FF)) + 0x10000;
            src += 2;
            continue;
        }
        *dst++ = *src++;
    }
    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1030
Victor Stinnercd9950f2011-10-02 00:34:53 +02001031static int
1032_PyUnicode_Dirty(PyObject *unicode)
1033{
Victor Stinner910337b2011-10-03 03:20:16 +02001034 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +02001035 if (Py_REFCNT(unicode) != 1) {
Victor Stinner01698042011-10-04 00:04:26 +02001036 PyErr_SetString(PyExc_SystemError,
Victor Stinnercd9950f2011-10-02 00:34:53 +02001037 "Cannot modify a string having more than 1 reference");
1038 return -1;
1039 }
1040 _PyUnicode_DIRTY(unicode);
1041 return 0;
1042}
1043
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001044static int
1045_copy_characters(PyObject *to, Py_ssize_t to_start,
1046 PyObject *from, Py_ssize_t from_start,
1047 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001048{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001049 unsigned int from_kind, to_kind;
1050 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001051 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001052
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001053 assert(PyUnicode_Check(from));
1054 assert(PyUnicode_Check(to));
1055 assert(PyUnicode_IS_READY(from));
1056 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001057
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001058 assert(PyUnicode_GET_LENGTH(from) >= how_many);
1059 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1060 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001061
Victor Stinnerf5ca1a22011-09-28 23:54:59 +02001062 if (how_many == 0)
1063 return 0;
1064
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001065 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001066 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001067 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001068 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001069
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001070#ifdef Py_DEBUG
1071 if (!check_maxchar
1072 && (from_kind > to_kind
1073 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001074 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001075 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1076 Py_UCS4 ch;
1077 Py_ssize_t i;
1078 for (i=0; i < how_many; i++) {
1079 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1080 assert(ch <= to_maxchar);
1081 }
1082 }
1083#endif
1084 fast = (from_kind == to_kind);
1085 if (check_maxchar
1086 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1087 {
1088 /* deny latin1 => ascii */
1089 fast = 0;
1090 }
1091
1092 if (fast) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001093 Py_MEMCPY((char*)to_data + to_kind * to_start,
1094 (char*)from_data + from_kind * from_start,
1095 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001096 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001097 else if (from_kind == PyUnicode_1BYTE_KIND
1098 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001099 {
1100 _PyUnicode_CONVERT_BYTES(
1101 Py_UCS1, Py_UCS2,
1102 PyUnicode_1BYTE_DATA(from) + from_start,
1103 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1104 PyUnicode_2BYTE_DATA(to) + to_start
1105 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001106 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001107 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001108 && to_kind == PyUnicode_4BYTE_KIND)
1109 {
1110 _PyUnicode_CONVERT_BYTES(
1111 Py_UCS1, Py_UCS4,
1112 PyUnicode_1BYTE_DATA(from) + from_start,
1113 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1114 PyUnicode_4BYTE_DATA(to) + to_start
1115 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001116 }
1117 else if (from_kind == PyUnicode_2BYTE_KIND
1118 && to_kind == PyUnicode_4BYTE_KIND)
1119 {
1120 _PyUnicode_CONVERT_BYTES(
1121 Py_UCS2, Py_UCS4,
1122 PyUnicode_2BYTE_DATA(from) + from_start,
1123 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1124 PyUnicode_4BYTE_DATA(to) + to_start
1125 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001126 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001127 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001128 /* check if max_char(from substring) <= max_char(to) */
1129 if (from_kind > to_kind
1130 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001131 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001132 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001133 /* slow path to check for character overflow */
1134 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001135 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001136 Py_ssize_t i;
1137
Victor Stinner56c161a2011-10-06 02:47:11 +02001138#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001139 for (i=0; i < how_many; i++) {
1140 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001141 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001142 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1143 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001144#else
1145 if (!check_maxchar) {
1146 for (i=0; i < how_many; i++) {
1147 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1148 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1149 }
1150 }
1151 else {
1152 for (i=0; i < how_many; i++) {
1153 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1154 if (ch > to_maxchar)
1155 return 1;
1156 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1157 }
1158 }
1159#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001160 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001161 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001162 assert(0 && "inconsistent state");
1163 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001164 }
1165 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001166 return 0;
1167}
1168
/* Unchecked variant of _copy_characters(): passes check_maxchar=0 and
   discards the result, so the caller must guarantee that the copied
   characters fit into `to`. */
static void
copy_characters(PyObject *to, Py_ssize_t to_start,
                PyObject *from, Py_ssize_t from_start,
                Py_ssize_t how_many)
{
    (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
}
1176
1177Py_ssize_t
1178PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1179 PyObject *from, Py_ssize_t from_start,
1180 Py_ssize_t how_many)
1181{
1182 int err;
1183
1184 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1185 PyErr_BadInternalCall();
1186 return -1;
1187 }
1188
1189 if (PyUnicode_READY(from))
1190 return -1;
1191 if (PyUnicode_READY(to))
1192 return -1;
1193
1194 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1195 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1196 PyErr_Format(PyExc_SystemError,
1197 "Cannot write %zi characters at %zi "
1198 "in a string of %zi characters",
1199 how_many, to_start, PyUnicode_GET_LENGTH(to));
1200 return -1;
1201 }
1202
1203 if (how_many == 0)
1204 return 0;
1205
1206 if (_PyUnicode_Dirty(to))
1207 return -1;
1208
1209 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1210 if (err) {
1211 PyErr_Format(PyExc_SystemError,
1212 "Cannot copy %s characters "
1213 "into a string of %s characters",
1214 unicode_kind_name(from),
1215 unicode_kind_name(to));
1216 return -1;
1217 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001218 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001219}
1220
Victor Stinner17222162011-09-28 22:15:37 +02001221/* Find the maximum code point and count the number of surrogate pairs so a
1222 correct string length can be computed before converting a string to UCS4.
1223 This function counts single surrogates as a character and not as a pair.
1224
1225 Return 0 on success, or -1 on error. */
1226static int
1227find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1228 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001229{
1230 const wchar_t *iter;
1231
Victor Stinnerc53be962011-10-02 21:33:54 +02001232 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001233 *num_surrogates = 0;
1234 *maxchar = 0;
1235
1236 for (iter = begin; iter < end; ) {
Victor Stinnerae864852011-10-05 14:02:44 +02001237 if (*iter > *maxchar) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001238 *maxchar = *iter;
Victor Stinnerae864852011-10-05 14:02:44 +02001239#if SIZEOF_WCHAR_T != 2
1240 if (*maxchar >= 0x10000)
1241 return 0;
1242#endif
1243 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001244#if SIZEOF_WCHAR_T == 2
1245 if (*iter >= 0xD800 && *iter <= 0xDBFF
1246 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1247 {
1248 Py_UCS4 surrogate_val;
1249 surrogate_val = (((iter[0] & 0x3FF)<<10)
1250 | (iter[1] & 0x3FF)) + 0x10000;
1251 ++(*num_surrogates);
1252 if (surrogate_val > *maxchar)
1253 *maxchar = surrogate_val;
1254 iter += 2;
1255 }
1256 else
1257 iter++;
1258#else
1259 iter++;
1260#endif
1261 }
1262 return 0;
1263}
1264
#ifdef Py_DEBUG
/* Debug-build counter, incremented on every unicode_ready() call. */
static int unicode_ready_calls = 0;
#endif

/* Convert the legacy (wchar_t-only) string *p_obj to its canonical
   representation: find the maximum character, pick the 1-, 2- or 4-byte
   kind, fill the data buffer and mark the string ready.

   If `replace` is true and the string is empty or a single character
   below 256, *p_obj is instead replaced by the shared empty string or the
   corresponding latin1 character object and the reference to the original
   is dropped.  In release builds `replace` is silently ignored when the
   string has more than one reference; debug builds assert on that.

   Returns 0 on success, -1 on error (exception set on allocation
   failure). */
static int
unicode_ready(PyObject **p_obj, int replace)
{
    PyUnicodeObject *unicode;
    wchar_t *end;
    Py_UCS4 maxchar = 0;
    Py_ssize_t num_surrogates;
#if SIZEOF_WCHAR_T == 2
    Py_ssize_t length_wo_surrogates;
#endif

    assert(p_obj != NULL);
    unicode = (PyUnicodeObject *)*p_obj;

    /* _PyUnicode_Ready() is only intended for old-style API usage where
       strings were created using _PyObject_New() and where no canonical
       representation (the str field) has been set yet aka strings
       which are not yet ready. */
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
    assert(_PyUnicode_WSTR(unicode) != NULL);
    assert(_PyUnicode_DATA_ANY(unicode) == NULL);
    assert(_PyUnicode_UTF8(unicode) == NULL);
    /* Actually, it should neither be interned nor be anything else: */
    assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);

#ifdef Py_DEBUG
    ++unicode_ready_calls;
#endif

#ifdef Py_DEBUG
    assert(!replace || Py_REFCNT(unicode) == 1);
#else
    /* A shared object cannot be swapped out from under its other owners. */
    if (replace && Py_REFCNT(unicode) != 1)
        replace = 0;
#endif
    if (replace) {
        Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode);
        wchar_t *wstr = _PyUnicode_WSTR(unicode);
        /* Optimization for empty strings */
        if (len == 0) {
            Py_INCREF(unicode_empty);
            Py_DECREF(*p_obj);
            *p_obj = unicode_empty;
            return 0;
        }
        /* Single latin1 character: hand out the object returned by
           get_latin1_char() instead. */
        if (len == 1 && wstr[0] < 256) {
            PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]);
            if (latin1_char == NULL)
                return -1;
            Py_DECREF(*p_obj);
            *p_obj = latin1_char;
            return 0;
        }
    }

    end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
    if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
                                &maxchar, &num_surrogates) == -1)
        return -1;

    if (maxchar < 256) {
        /* 1-byte kind: narrow into a fresh buffer and release the wchar_t
           buffer. */
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
                                 _PyUnicode_WSTR(unicode), end,
                                 PyUnicode_1BYTE_DATA(unicode));
        PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
        if (maxchar < 128) {
            /* Pure ASCII: the data buffer doubles as the UTF-8 form. */
            _PyUnicode_STATE(unicode).ascii = 1;
            _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
            _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        }
        else {
            _PyUnicode_STATE(unicode).ascii = 0;
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
    }
    /* In this case we might have to convert down from 4-byte native
       wchar_t to 2-byte unicode. */
    else if (maxchar < 65536) {
        assert(num_surrogates == 0 &&
               "FindMaxCharAndNumSurrogatePairs() messed up");

#if SIZEOF_WCHAR_T == 2
        /* We can share representations and are done. */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
#else
        /* sizeof(wchar_t) == 4 */
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
            2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
                                 _PyUnicode_WSTR(unicode), end,
                                 PyUnicode_2BYTE_DATA(unicode));
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#endif
    }
    /* maxchar exceeds 16 bits, we need 4 bytes for unicode characters */
    else {
#if SIZEOF_WCHAR_T == 2
        /* in case the native representation is 2-bytes, we need to allocate a
           new normalized 4-byte version. */
        /* Each surrogate pair collapses into a single code point. */
        length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        /* unicode_convert_wchar_to_ucs4() requires a ready string */
        _PyUnicode_STATE(unicode).ready = 1;
        unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#else
        assert(num_surrogates == 0);

        /* wchar_t is already 4 bytes wide: share the buffer. */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
#endif
        PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
    }
    _PyUnicode_STATE(unicode).ready = 1;
    assert(_PyUnicode_CheckConsistency(unicode, 1));
    return 0;
}
1427
/* Make the string *op ready, allowing unicode_ready() to replace *op by a
   different object (replace=1).  Returns 0 on success, -1 on error. */
int
_PyUnicode_ReadyReplace(PyObject **op)
{
    return unicode_ready(op, 1);
}
1433
/* Make the string `op` ready in place (replace=0), so the object itself is
   converted and never substituted.  Returns 0 on success, -1 on error. */
int
_PyUnicode_Ready(PyObject *op)
{
    return unicode_ready(&op, 0);
}
1439
Alexander Belopolsky40018472011-02-26 01:02:56 +00001440static void
1441unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001442{
Walter Dörwald16807132007-05-25 13:52:07 +00001443 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001444 case SSTATE_NOT_INTERNED:
1445 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001446
Benjamin Peterson29060642009-01-31 22:14:21 +00001447 case SSTATE_INTERNED_MORTAL:
1448 /* revive dead object temporarily for DelItem */
1449 Py_REFCNT(unicode) = 3;
1450 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
1451 Py_FatalError(
1452 "deletion of interned string failed");
1453 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001454
Benjamin Peterson29060642009-01-31 22:14:21 +00001455 case SSTATE_INTERNED_IMMORTAL:
1456 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001457
Benjamin Peterson29060642009-01-31 22:14:21 +00001458 default:
1459 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001460 }
1461
Victor Stinner03490912011-10-03 23:45:12 +02001462 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001463 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001464 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001465 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001466
1467 if (PyUnicode_IS_COMPACT(unicode)) {
1468 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001469 }
1470 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001471 if (_PyUnicode_DATA_ANY(unicode))
1472 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Benjamin Peterson29060642009-01-31 22:14:21 +00001473 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001474 }
1475}
1476
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if `unicode` is the shared empty string or
   the cached object stored in unicode_latin1[] for its single character,
   and 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a one-character string that is not of the wchar kind can be a
       cached Latin-1 character. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode) ? 1 : 0;
}
#endif
1493
Alexander Belopolsky40018472011-02-26 01:02:56 +00001494static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001495unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001496{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001497 if (Py_REFCNT(unicode) != 1)
1498 return 0;
1499 if (PyUnicode_CHECK_INTERNED(unicode))
1500 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001501#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001502 /* singleton refcount is greater than 1 */
1503 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001504#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001505 return 1;
1506}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001507
Victor Stinnerfe226c02011-10-03 03:52:20 +02001508static int
1509unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1510{
1511 PyObject *unicode;
1512 Py_ssize_t old_length;
1513
1514 assert(p_unicode != NULL);
1515 unicode = *p_unicode;
1516
1517 assert(unicode != NULL);
1518 assert(PyUnicode_Check(unicode));
1519 assert(0 <= length);
1520
Victor Stinner910337b2011-10-03 03:20:16 +02001521 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001522 old_length = PyUnicode_WSTR_LENGTH(unicode);
1523 else
1524 old_length = PyUnicode_GET_LENGTH(unicode);
1525 if (old_length == length)
1526 return 0;
1527
Victor Stinnerfe226c02011-10-03 03:52:20 +02001528 if (!unicode_resizable(unicode)) {
1529 PyObject *copy = resize_copy(unicode, length);
1530 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001531 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001532 Py_DECREF(*p_unicode);
1533 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001534 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001535 }
1536
Victor Stinnerfe226c02011-10-03 03:52:20 +02001537 if (PyUnicode_IS_COMPACT(unicode)) {
1538 *p_unicode = resize_compact(unicode, length);
1539 if (*p_unicode == NULL)
1540 return -1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001541 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001542 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001543 }
1544 return resize_inplace((PyUnicodeObject*)unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001545}
1546
Alexander Belopolsky40018472011-02-26 01:02:56 +00001547int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001548PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001549{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001550 PyObject *unicode;
1551 if (p_unicode == NULL) {
1552 PyErr_BadInternalCall();
1553 return -1;
1554 }
1555 unicode = *p_unicode;
1556 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0
1557 || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND)
1558 {
1559 PyErr_BadInternalCall();
1560 return -1;
1561 }
1562 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001563}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001564
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001565static PyObject*
1566get_latin1_char(unsigned char ch)
1567{
Victor Stinnera464fc12011-10-02 20:39:30 +02001568 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001569 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001570 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001571 if (!unicode)
1572 return NULL;
1573 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001574 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001575 unicode_latin1[ch] = unicode;
1576 }
1577 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001578 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001579}
1580
Alexander Belopolsky40018472011-02-26 01:02:56 +00001581PyObject *
1582PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001583{
1584 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001585 Py_UCS4 maxchar = 0;
1586 Py_ssize_t num_surrogates;
1587
1588 if (u == NULL)
1589 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001590
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001591 /* If the Unicode data is known at construction time, we can apply
1592 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001593
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001594 /* Optimization for empty strings */
1595 if (size == 0 && unicode_empty != NULL) {
1596 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001597 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001598 }
Tim Petersced69f82003-09-16 20:30:58 +00001599
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001600 /* Single character Unicode objects in the Latin-1 range are
1601 shared when using this constructor */
1602 if (size == 1 && *u < 256)
1603 return get_latin1_char((unsigned char)*u);
1604
1605 /* If not empty and not single character, copy the Unicode data
1606 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001607 if (find_maxchar_surrogates(u, u + size,
1608 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001609 return NULL;
1610
1611 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1612 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001613 if (!unicode)
1614 return NULL;
1615
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001616 switch (PyUnicode_KIND(unicode)) {
1617 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001618 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001619 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1620 break;
1621 case PyUnicode_2BYTE_KIND:
1622#if Py_UNICODE_SIZE == 2
1623 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1624#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001625 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001626 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1627#endif
1628 break;
1629 case PyUnicode_4BYTE_KIND:
1630#if SIZEOF_WCHAR_T == 2
1631 /* This is the only case which has to process surrogates, thus
1632 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001633 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001634#else
1635 assert(num_surrogates == 0);
1636 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1637#endif
1638 break;
1639 default:
1640 assert(0 && "Impossible state");
1641 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001642
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001643 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001644 return (PyObject *)unicode;
1645}
1646
Alexander Belopolsky40018472011-02-26 01:02:56 +00001647PyObject *
1648PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001649{
1650 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001651
Benjamin Peterson14339b62009-01-31 16:36:08 +00001652 if (size < 0) {
1653 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001654 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001655 return NULL;
1656 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001657
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001658 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001659 some optimizations which share commonly used objects.
1660 Also, this means the input must be UTF-8, so fall back to the
1661 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001662 if (u != NULL) {
1663
Benjamin Peterson29060642009-01-31 22:14:21 +00001664 /* Optimization for empty strings */
1665 if (size == 0 && unicode_empty != NULL) {
1666 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001667 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001668 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001669
1670 /* Single characters are shared when using this constructor.
1671 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001672 if (size == 1 && Py_CHARMASK(*u) < 128)
1673 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001674
1675 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001676 }
1677
Walter Dörwald55507312007-05-18 13:12:10 +00001678 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001679 if (!unicode)
1680 return NULL;
1681
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001682 return (PyObject *)unicode;
1683}
1684
Alexander Belopolsky40018472011-02-26 01:02:56 +00001685PyObject *
1686PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001687{
1688 size_t size = strlen(u);
1689 if (size > PY_SSIZE_T_MAX) {
1690 PyErr_SetString(PyExc_OverflowError, "input too long");
1691 return NULL;
1692 }
1693
1694 return PyUnicode_FromStringAndSize(u, size);
1695}
1696
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001697PyObject *
1698_PyUnicode_FromId(_Py_Identifier *id)
1699{
1700 if (!id->object) {
1701 id->object = PyUnicode_FromString(id->string);
1702 if (!id->object)
1703 return NULL;
1704 PyUnicode_InternInPlace(&id->object);
1705 assert(!id->next);
1706 id->next = static_strings;
1707 static_strings = id;
1708 }
1709 Py_INCREF(id->object);
1710 return id->object;
1711}
1712
1713void
1714_PyUnicode_ClearStaticStrings()
1715{
1716 _Py_Identifier *i;
1717 for (i = static_strings; i; i = i->next) {
1718 Py_DECREF(i->object);
1719 i->object = NULL;
1720 i->next = NULL;
1721 }
1722}
1723
Victor Stinnere57b1c02011-09-28 22:20:48 +02001724static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001725unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001726{
Victor Stinner0617b6e2011-10-05 23:26:01 +02001727 PyObject *res;
1728#ifdef Py_DEBUG
1729 const unsigned char *p;
1730 const unsigned char *end = s + size;
1731 for (p=s; p < end; p++) {
1732 assert(*p < 128);
1733 }
1734#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001735 if (size == 1)
1736 return get_latin1_char(s[0]);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001737 res = PyUnicode_New(size, 127);
Victor Stinner702c7342011-10-05 13:50:52 +02001738 if (!res)
1739 return NULL;
Victor Stinner0617b6e2011-10-05 23:26:01 +02001740 memcpy(PyUnicode_1BYTE_DATA(res), s, size);
Victor Stinner702c7342011-10-05 13:50:52 +02001741 return res;
1742}
1743
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001744static Py_UCS4
1745kind_maxchar_limit(unsigned int kind)
1746{
1747 switch(kind) {
1748 case PyUnicode_1BYTE_KIND:
1749 return 0x80;
1750 case PyUnicode_2BYTE_KIND:
1751 return 0x100;
1752 case PyUnicode_4BYTE_KIND:
1753 return 0x10000;
1754 default:
1755 assert(0 && "invalid kind");
1756 return 0x10ffff;
1757 }
1758}
1759
Victor Stinner702c7342011-10-05 13:50:52 +02001760static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001761_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001762{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001763 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001764 unsigned char max_char = 127;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001765
1766 assert(size >= 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001767 if (size == 1)
1768 return get_latin1_char(u[0]);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001769 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001770 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001771 if (!res)
1772 return NULL;
1773 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001774 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001775 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001776}
1777
Victor Stinnere57b1c02011-09-28 22:20:48 +02001778static PyObject*
1779_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001780{
1781 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001782 Py_UCS2 max_char = 0;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001783
1784 assert(size >= 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001785 if (size == 1 && u[0] < 256)
Victor Stinner4e101002011-10-11 23:27:52 +02001786 return get_latin1_char((unsigned char)u[0]);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001787 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001788 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001789 if (!res)
1790 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001791 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001792 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001793 else {
1794 _PyUnicode_CONVERT_BYTES(
1795 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1796 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001797 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001798 return res;
1799}
1800
Victor Stinnere57b1c02011-09-28 22:20:48 +02001801static PyObject*
1802_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001803{
1804 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001805 Py_UCS4 max_char = 0;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001806
1807 assert(size >= 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001808 if (size == 1 && u[0] < 256)
1809 return get_latin1_char(u[0]);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001810 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001811 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001812 if (!res)
1813 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001814 if (max_char < 256)
1815 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1816 PyUnicode_1BYTE_DATA(res));
1817 else if (max_char < 0x10000)
1818 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1819 PyUnicode_2BYTE_DATA(res));
1820 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001821 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001822 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001823 return res;
1824}
1825
1826PyObject*
1827PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1828{
1829 switch(kind) {
1830 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001831 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001832 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001833 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001834 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001835 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001836 default:
1837 assert(0 && "invalid kind");
1838 PyErr_SetString(PyExc_SystemError, "invalid kind");
1839 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001840 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001841}
1842
Victor Stinner25a4b292011-10-06 12:31:55 +02001843/* Ensure that a string uses the most efficient storage, if it is not the
1844 case: create a new string with of the right kind. Write NULL into *p_unicode
1845 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001846static void
Victor Stinner25a4b292011-10-06 12:31:55 +02001847unicode_adjust_maxchar(PyObject **p_unicode)
1848{
1849 PyObject *unicode, *copy;
1850 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001851 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02001852 unsigned int kind;
1853
1854 assert(p_unicode != NULL);
1855 unicode = *p_unicode;
1856 assert(PyUnicode_IS_READY(unicode));
1857 if (PyUnicode_IS_ASCII(unicode))
1858 return;
1859
1860 len = PyUnicode_GET_LENGTH(unicode);
1861 kind = PyUnicode_KIND(unicode);
1862 if (kind == PyUnicode_1BYTE_KIND) {
1863 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001864 max_char = ucs1lib_find_max_char(u, u + len);
1865 if (max_char >= 128)
1866 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001867 }
1868 else if (kind == PyUnicode_2BYTE_KIND) {
1869 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001870 max_char = ucs2lib_find_max_char(u, u + len);
1871 if (max_char >= 256)
1872 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001873 }
1874 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001875 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02001876 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001877 max_char = ucs4lib_find_max_char(u, u + len);
1878 if (max_char >= 0x10000)
1879 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001880 }
Victor Stinner25a4b292011-10-06 12:31:55 +02001881 copy = PyUnicode_New(len, max_char);
1882 copy_characters(copy, 0, unicode, 0, len);
1883 Py_DECREF(unicode);
1884 *p_unicode = copy;
1885}
1886
Victor Stinner034f6cf2011-09-30 02:26:44 +02001887PyObject*
1888PyUnicode_Copy(PyObject *unicode)
1889{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001890 Py_ssize_t size;
1891 PyObject *copy;
1892 void *data;
1893
Victor Stinner034f6cf2011-09-30 02:26:44 +02001894 if (!PyUnicode_Check(unicode)) {
1895 PyErr_BadInternalCall();
1896 return NULL;
1897 }
1898 if (PyUnicode_READY(unicode))
1899 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001900
1901 size = PyUnicode_GET_LENGTH(unicode);
1902 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1903 if (!copy)
1904 return NULL;
1905 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1906
1907 data = PyUnicode_DATA(unicode);
1908 switch (PyUnicode_KIND(unicode))
1909 {
1910 case PyUnicode_1BYTE_KIND:
1911 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1912 break;
1913 case PyUnicode_2BYTE_KIND:
1914 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1915 break;
1916 case PyUnicode_4BYTE_KIND:
1917 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1918 break;
1919 default:
1920 assert(0);
1921 break;
1922 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001923 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001924 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001925}
1926
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001927
Victor Stinnerbc603d12011-10-02 01:00:40 +02001928/* Widen Unicode objects to larger buffers. Don't write terminating null
1929 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001930
1931void*
1932_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1933{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001934 Py_ssize_t len;
1935 void *result;
1936 unsigned int skind;
1937
1938 if (PyUnicode_READY(s))
1939 return NULL;
1940
1941 len = PyUnicode_GET_LENGTH(s);
1942 skind = PyUnicode_KIND(s);
1943 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02001944 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001945 return NULL;
1946 }
1947 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001948 case PyUnicode_2BYTE_KIND:
1949 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1950 if (!result)
1951 return PyErr_NoMemory();
1952 assert(skind == PyUnicode_1BYTE_KIND);
1953 _PyUnicode_CONVERT_BYTES(
1954 Py_UCS1, Py_UCS2,
1955 PyUnicode_1BYTE_DATA(s),
1956 PyUnicode_1BYTE_DATA(s) + len,
1957 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001958 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001959 case PyUnicode_4BYTE_KIND:
1960 result = PyMem_Malloc(len * sizeof(Py_UCS4));
1961 if (!result)
1962 return PyErr_NoMemory();
1963 if (skind == PyUnicode_2BYTE_KIND) {
1964 _PyUnicode_CONVERT_BYTES(
1965 Py_UCS2, Py_UCS4,
1966 PyUnicode_2BYTE_DATA(s),
1967 PyUnicode_2BYTE_DATA(s) + len,
1968 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001969 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001970 else {
1971 assert(skind == PyUnicode_1BYTE_KIND);
1972 _PyUnicode_CONVERT_BYTES(
1973 Py_UCS1, Py_UCS4,
1974 PyUnicode_1BYTE_DATA(s),
1975 PyUnicode_1BYTE_DATA(s) + len,
1976 result);
1977 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001978 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001979 default:
1980 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001981 }
Victor Stinner01698042011-10-04 00:04:26 +02001982 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001983 return NULL;
1984}
1985
1986static Py_UCS4*
1987as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1988 int copy_null)
1989{
1990 int kind;
1991 void *data;
1992 Py_ssize_t len, targetlen;
1993 if (PyUnicode_READY(string) == -1)
1994 return NULL;
1995 kind = PyUnicode_KIND(string);
1996 data = PyUnicode_DATA(string);
1997 len = PyUnicode_GET_LENGTH(string);
1998 targetlen = len;
1999 if (copy_null)
2000 targetlen++;
2001 if (!target) {
2002 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2003 PyErr_NoMemory();
2004 return NULL;
2005 }
2006 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2007 if (!target) {
2008 PyErr_NoMemory();
2009 return NULL;
2010 }
2011 }
2012 else {
2013 if (targetsize < targetlen) {
2014 PyErr_Format(PyExc_SystemError,
2015 "string is longer than the buffer");
2016 if (copy_null && 0 < targetsize)
2017 target[0] = 0;
2018 return NULL;
2019 }
2020 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002021 if (kind == PyUnicode_1BYTE_KIND) {
2022 Py_UCS1 *start = (Py_UCS1 *) data;
2023 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002024 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002025 else if (kind == PyUnicode_2BYTE_KIND) {
2026 Py_UCS2 *start = (Py_UCS2 *) data;
2027 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2028 }
2029 else {
2030 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002031 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002032 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002033 if (copy_null)
2034 target[len] = 0;
2035 return target;
2036}
2037
2038Py_UCS4*
2039PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2040 int copy_null)
2041{
2042 if (target == NULL || targetsize < 1) {
2043 PyErr_BadInternalCall();
2044 return NULL;
2045 }
2046 return as_ucs4(string, target, targetsize, copy_null);
2047}
2048
2049Py_UCS4*
2050PyUnicode_AsUCS4Copy(PyObject *string)
2051{
2052 return as_ucs4(string, NULL, 0, 1);
2053}
2054
#ifdef HAVE_WCHAR_H

/* Create a unicode object from `size` wide characters at `w`; a size of -1
   means that `w` is NUL-terminated and is measured with wcslen().
   With w == NULL an empty string is returned for size == 0 and any other
   size fails through PyErr_BadInternalCall(). */
PyObject *
PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
{
    if (w != NULL) {
        if (size == -1)
            size = wcslen(w);
        return PyUnicode_FromUnicode(w, size);
    }

    if (size != 0) {
        PyErr_BadInternalCall();
        return NULL;
    }
    return PyUnicode_New(0, 0);
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002075
Walter Dörwald346737f2007-05-31 10:44:43 +00002076static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002077makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2078 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002079{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002080 *fmt++ = '%';
2081 if (width) {
2082 if (zeropad)
2083 *fmt++ = '0';
2084 fmt += sprintf(fmt, "%d", width);
2085 }
2086 if (precision)
2087 fmt += sprintf(fmt, ".%d", precision);
2088 if (longflag)
2089 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002090 else if (longlongflag) {
2091 /* longlongflag should only ever be nonzero on machines with
2092 HAVE_LONG_LONG defined */
2093#ifdef HAVE_LONG_LONG
2094 char *f = PY_FORMAT_LONG_LONG;
2095 while (*f)
2096 *fmt++ = *f++;
2097#else
2098 /* we shouldn't ever get here */
2099 assert(0);
2100 *fmt++ = 'l';
2101#endif
2102 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002103 else if (size_tflag) {
2104 char *f = PY_FORMAT_SIZE_T;
2105 while (*f)
2106 *fmt++ = *f++;
2107 }
2108 *fmt++ = c;
2109 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002110}
2111
Victor Stinner96865452011-03-01 23:44:09 +00002112/* helper for PyUnicode_FromFormatV() */
2113
2114static const char*
2115parse_format_flags(const char *f,
2116 int *p_width, int *p_precision,
2117 int *p_longflag, int *p_longlongflag, int *p_size_tflag)
2118{
2119 int width, precision, longflag, longlongflag, size_tflag;
2120
2121 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2122 f++;
2123 width = 0;
2124 while (Py_ISDIGIT((unsigned)*f))
2125 width = (width*10) + *f++ - '0';
2126 precision = 0;
2127 if (*f == '.') {
2128 f++;
2129 while (Py_ISDIGIT((unsigned)*f))
2130 precision = (precision*10) + *f++ - '0';
2131 if (*f == '%') {
2132 /* "%.3%s" => f points to "3" */
2133 f--;
2134 }
2135 }
2136 if (*f == '\0') {
2137 /* bogus format "%.1" => go backward, f points to "1" */
2138 f--;
2139 }
2140 if (p_width != NULL)
2141 *p_width = width;
2142 if (p_precision != NULL)
2143 *p_precision = precision;
2144
2145 /* Handle %ld, %lu, %lld and %llu. */
2146 longflag = 0;
2147 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002148 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002149
2150 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002151 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002152 longflag = 1;
2153 ++f;
2154 }
2155#ifdef HAVE_LONG_LONG
2156 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002157 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002158 longlongflag = 1;
2159 f += 2;
2160 }
2161#endif
2162 }
2163 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002164 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002165 size_tflag = 1;
2166 ++f;
2167 }
2168 if (p_longflag != NULL)
2169 *p_longflag = longflag;
2170 if (p_longlongflag != NULL)
2171 *p_longlongflag = longlongflag;
2172 if (p_size_tflag != NULL)
2173 *p_size_tflag = size_tflag;
2174 return f;
2175}
2176
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002177/* maximum number of characters required for output of %ld. 21 characters
2178 allows for 64-bit integers (in decimal) and an optional sign. */
2179#define MAX_LONG_CHARS 21
2180/* maximum number of characters required for output of %lld.
2181 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2182 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2183#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2184
Walter Dörwaldd2034312007-05-18 16:29:38 +00002185PyObject *
2186PyUnicode_FromFormatV(const char *format, va_list vargs)
2187{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002188 va_list count;
2189 Py_ssize_t callcount = 0;
2190 PyObject **callresults = NULL;
2191 PyObject **callresult = NULL;
2192 Py_ssize_t n = 0;
2193 int width = 0;
2194 int precision = 0;
2195 int zeropad;
2196 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002197 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002198 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002199 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002200 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2201 Py_UCS4 argmaxchar;
2202 Py_ssize_t numbersize = 0;
2203 char *numberresults = NULL;
2204 char *numberresult = NULL;
2205 Py_ssize_t i;
2206 int kind;
2207 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002208
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002209 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002210 /* step 1: count the number of %S/%R/%A/%s format specifications
2211 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2212 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002213 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002214 * also estimate a upper bound for all the number formats in the string,
2215 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002216 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002217 for (f = format; *f; f++) {
2218 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002219 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002220 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2221 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2222 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2223 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002224
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002225 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002226#ifdef HAVE_LONG_LONG
2227 if (longlongflag) {
2228 if (width < MAX_LONG_LONG_CHARS)
2229 width = MAX_LONG_LONG_CHARS;
2230 }
2231 else
2232#endif
2233 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2234 including sign. Decimal takes the most space. This
2235 isn't enough for octal. If a width is specified we
2236 need more (which we allocate later). */
2237 if (width < MAX_LONG_CHARS)
2238 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002239
2240 /* account for the size + '\0' to separate numbers
2241 inside of the numberresults buffer */
2242 numbersize += (width + 1);
2243 }
2244 }
2245 else if ((unsigned char)*f > 127) {
2246 PyErr_Format(PyExc_ValueError,
2247 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2248 "string, got a non-ASCII byte: 0x%02x",
2249 (unsigned char)*f);
2250 return NULL;
2251 }
2252 }
2253 /* step 2: allocate memory for the results of
2254 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2255 if (callcount) {
2256 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2257 if (!callresults) {
2258 PyErr_NoMemory();
2259 return NULL;
2260 }
2261 callresult = callresults;
2262 }
2263 /* step 2.5: allocate memory for the results of formating numbers */
2264 if (numbersize) {
2265 numberresults = PyObject_Malloc(numbersize);
2266 if (!numberresults) {
2267 PyErr_NoMemory();
2268 goto fail;
2269 }
2270 numberresult = numberresults;
2271 }
2272
2273 /* step 3: format numbers and figure out how large a buffer we need */
2274 for (f = format; *f; f++) {
2275 if (*f == '%') {
2276 const char* p;
2277 int longflag;
2278 int longlongflag;
2279 int size_tflag;
2280 int numprinted;
2281
2282 p = f;
2283 zeropad = (f[1] == '0');
2284 f = parse_format_flags(f, &width, &precision,
2285 &longflag, &longlongflag, &size_tflag);
2286 switch (*f) {
2287 case 'c':
2288 {
2289 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002290 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002291 n++;
2292 break;
2293 }
2294 case '%':
2295 n++;
2296 break;
2297 case 'i':
2298 case 'd':
2299 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2300 width, precision, *f);
2301 if (longflag)
2302 numprinted = sprintf(numberresult, fmt,
2303 va_arg(count, long));
2304#ifdef HAVE_LONG_LONG
2305 else if (longlongflag)
2306 numprinted = sprintf(numberresult, fmt,
2307 va_arg(count, PY_LONG_LONG));
2308#endif
2309 else if (size_tflag)
2310 numprinted = sprintf(numberresult, fmt,
2311 va_arg(count, Py_ssize_t));
2312 else
2313 numprinted = sprintf(numberresult, fmt,
2314 va_arg(count, int));
2315 n += numprinted;
2316 /* advance by +1 to skip over the '\0' */
2317 numberresult += (numprinted + 1);
2318 assert(*(numberresult - 1) == '\0');
2319 assert(*(numberresult - 2) != '\0');
2320 assert(numprinted >= 0);
2321 assert(numberresult <= numberresults + numbersize);
2322 break;
2323 case 'u':
2324 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2325 width, precision, 'u');
2326 if (longflag)
2327 numprinted = sprintf(numberresult, fmt,
2328 va_arg(count, unsigned long));
2329#ifdef HAVE_LONG_LONG
2330 else if (longlongflag)
2331 numprinted = sprintf(numberresult, fmt,
2332 va_arg(count, unsigned PY_LONG_LONG));
2333#endif
2334 else if (size_tflag)
2335 numprinted = sprintf(numberresult, fmt,
2336 va_arg(count, size_t));
2337 else
2338 numprinted = sprintf(numberresult, fmt,
2339 va_arg(count, unsigned int));
2340 n += numprinted;
2341 numberresult += (numprinted + 1);
2342 assert(*(numberresult - 1) == '\0');
2343 assert(*(numberresult - 2) != '\0');
2344 assert(numprinted >= 0);
2345 assert(numberresult <= numberresults + numbersize);
2346 break;
2347 case 'x':
2348 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2349 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2350 n += numprinted;
2351 numberresult += (numprinted + 1);
2352 assert(*(numberresult - 1) == '\0');
2353 assert(*(numberresult - 2) != '\0');
2354 assert(numprinted >= 0);
2355 assert(numberresult <= numberresults + numbersize);
2356 break;
2357 case 'p':
2358 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2359 /* %p is ill-defined: ensure leading 0x. */
2360 if (numberresult[1] == 'X')
2361 numberresult[1] = 'x';
2362 else if (numberresult[1] != 'x') {
2363 memmove(numberresult + 2, numberresult,
2364 strlen(numberresult) + 1);
2365 numberresult[0] = '0';
2366 numberresult[1] = 'x';
2367 numprinted += 2;
2368 }
2369 n += numprinted;
2370 numberresult += (numprinted + 1);
2371 assert(*(numberresult - 1) == '\0');
2372 assert(*(numberresult - 2) != '\0');
2373 assert(numprinted >= 0);
2374 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002375 break;
2376 case 's':
2377 {
2378 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002379 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002380 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2381 if (!str)
2382 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002383 /* since PyUnicode_DecodeUTF8 returns already flexible
2384 unicode objects, there is no need to call ready on them */
2385 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002386 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002387 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002388 /* Remember the str and switch to the next slot */
2389 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002390 break;
2391 }
2392 case 'U':
2393 {
2394 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002395 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002396 if (PyUnicode_READY(obj) == -1)
2397 goto fail;
2398 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002399 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002400 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002401 break;
2402 }
2403 case 'V':
2404 {
2405 PyObject *obj = va_arg(count, PyObject *);
2406 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002407 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002408 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002409 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002410 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002411 if (PyUnicode_READY(obj) == -1)
2412 goto fail;
2413 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002414 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002415 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002416 *callresult++ = NULL;
2417 }
2418 else {
2419 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2420 if (!str_obj)
2421 goto fail;
Victor Stinnere1335c72011-10-04 20:53:03 +02002422 if (PyUnicode_READY(str_obj)) {
2423 Py_DECREF(str_obj);
2424 goto fail;
2425 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002426 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002427 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002428 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002429 *callresult++ = str_obj;
2430 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002431 break;
2432 }
2433 case 'S':
2434 {
2435 PyObject *obj = va_arg(count, PyObject *);
2436 PyObject *str;
2437 assert(obj);
2438 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002439 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002440 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002441 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002442 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002443 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002444 /* Remember the str and switch to the next slot */
2445 *callresult++ = str;
2446 break;
2447 }
2448 case 'R':
2449 {
2450 PyObject *obj = va_arg(count, PyObject *);
2451 PyObject *repr;
2452 assert(obj);
2453 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002454 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002455 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002456 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002457 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002458 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002459 /* Remember the repr and switch to the next slot */
2460 *callresult++ = repr;
2461 break;
2462 }
2463 case 'A':
2464 {
2465 PyObject *obj = va_arg(count, PyObject *);
2466 PyObject *ascii;
2467 assert(obj);
2468 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002469 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002470 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002471 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002472 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002473 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002474 /* Remember the repr and switch to the next slot */
2475 *callresult++ = ascii;
2476 break;
2477 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002478 default:
2479 /* if we stumble upon an unknown
2480 formatting code, copy the rest of
2481 the format string to the output
2482 string. (we cannot just skip the
2483 code, since there's no way to know
2484 what's in the argument list) */
2485 n += strlen(p);
2486 goto expand;
2487 }
2488 } else
2489 n++;
2490 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002491 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002492 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002493 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002494 we don't have to resize the string.
2495 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002496 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002497 if (!string)
2498 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002499 kind = PyUnicode_KIND(string);
2500 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002501 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002502 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002503
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002504 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002505 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002506 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002507
2508 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002509 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2510 /* checking for == because the last argument could be a empty
2511 string, which causes i to point to end, the assert at the end of
2512 the loop */
2513 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002514
Benjamin Peterson14339b62009-01-31 16:36:08 +00002515 switch (*f) {
2516 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002517 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002518 const int ordinal = va_arg(vargs, int);
2519 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002520 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002521 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002522 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002523 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002524 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002525 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002526 case 'p':
2527 /* unused, since we already have the result */
2528 if (*f == 'p')
2529 (void) va_arg(vargs, void *);
2530 else
2531 (void) va_arg(vargs, int);
2532 /* extract the result from numberresults and append. */
2533 for (; *numberresult; ++i, ++numberresult)
2534 PyUnicode_WRITE(kind, data, i, *numberresult);
2535 /* skip over the separating '\0' */
2536 assert(*numberresult == '\0');
2537 numberresult++;
2538 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002539 break;
2540 case 's':
2541 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002542 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002543 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002544 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002545 size = PyUnicode_GET_LENGTH(*callresult);
2546 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002547 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002548 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002549 /* We're done with the unicode()/repr() => forget it */
2550 Py_DECREF(*callresult);
2551 /* switch to next unicode()/repr() result */
2552 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002553 break;
2554 }
2555 case 'U':
2556 {
2557 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002558 Py_ssize_t size;
2559 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2560 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002561 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002562 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002563 break;
2564 }
2565 case 'V':
2566 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002567 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002568 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002569 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002570 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002571 size = PyUnicode_GET_LENGTH(obj);
2572 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002573 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002574 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002575 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002576 size = PyUnicode_GET_LENGTH(*callresult);
2577 assert(PyUnicode_KIND(*callresult) <=
2578 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002579 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002580 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002581 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002582 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002583 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002584 break;
2585 }
2586 case 'S':
2587 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002588 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002589 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002590 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002591 /* unused, since we already have the result */
2592 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002593 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002594 copy_characters(string, i, *callresult, 0, size);
2595 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002596 /* We're done with the unicode()/repr() => forget it */
2597 Py_DECREF(*callresult);
2598 /* switch to next unicode()/repr() result */
2599 ++callresult;
2600 break;
2601 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002602 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002603 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002604 break;
2605 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002606 for (; *p; ++p, ++i)
2607 PyUnicode_WRITE(kind, data, i, *p);
2608 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002609 goto end;
2610 }
Victor Stinner1205f272010-09-11 00:54:47 +00002611 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002612 else {
2613 assert(i < PyUnicode_GET_LENGTH(string));
2614 PyUnicode_WRITE(kind, data, i++, *f);
2615 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002616 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002617 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002618
Benjamin Peterson29060642009-01-31 22:14:21 +00002619 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002620 if (callresults)
2621 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002622 if (numberresults)
2623 PyObject_Free(numberresults);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002624 assert(_PyUnicode_CheckConsistency(string, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002625 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00002626 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002627 if (callresults) {
2628 PyObject **callresult2 = callresults;
2629 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002630 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002631 ++callresult2;
2632 }
2633 PyObject_Free(callresults);
2634 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002635 if (numberresults)
2636 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002637 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002638}
2639
Walter Dörwaldd2034312007-05-18 16:29:38 +00002640PyObject *
2641PyUnicode_FromFormat(const char *format, ...)
2642{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002643 PyObject* ret;
2644 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002645
2646#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002647 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002648#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002649 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002650#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002651 ret = PyUnicode_FromFormatV(format, vargs);
2652 va_end(vargs);
2653 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002654}
2655
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002656#ifdef HAVE_WCHAR_H
2657
Victor Stinner5593d8a2010-10-02 11:11:27 +00002658/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2659 convert a Unicode object to a wide character string.
2660
Victor Stinnerd88d9832011-09-06 02:00:05 +02002661 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002662 character) required to convert the unicode object. Ignore size argument.
2663
Victor Stinnerd88d9832011-09-06 02:00:05 +02002664 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002665 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002666 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002667static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00002668unicode_aswidechar(PyUnicodeObject *unicode,
2669 wchar_t *w,
2670 Py_ssize_t size)
2671{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002672 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002673 const wchar_t *wstr;
2674
2675 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
2676 if (wstr == NULL)
2677 return -1;
2678
Victor Stinner5593d8a2010-10-02 11:11:27 +00002679 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002680 if (size > res)
2681 size = res + 1;
2682 else
2683 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002684 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002685 return res;
2686 }
2687 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002688 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002689}
2690
2691Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002692PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002693 wchar_t *w,
2694 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002695{
2696 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002697 PyErr_BadInternalCall();
2698 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002699 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002700 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002701}
2702
Victor Stinner137c34c2010-09-29 10:25:54 +00002703wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002704PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002705 Py_ssize_t *size)
2706{
2707 wchar_t* buffer;
2708 Py_ssize_t buflen;
2709
2710 if (unicode == NULL) {
2711 PyErr_BadInternalCall();
2712 return NULL;
2713 }
2714
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002715 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002716 if (buflen == -1)
2717 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002718 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002719 PyErr_NoMemory();
2720 return NULL;
2721 }
2722
Victor Stinner137c34c2010-09-29 10:25:54 +00002723 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2724 if (buffer == NULL) {
2725 PyErr_NoMemory();
2726 return NULL;
2727 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002728 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002729 if (buflen == -1)
2730 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002731 if (size != NULL)
2732 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002733 return buffer;
2734}
2735
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002736#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002737
Alexander Belopolsky40018472011-02-26 01:02:56 +00002738PyObject *
2739PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002740{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002741 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002742 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002743 PyErr_SetString(PyExc_ValueError,
2744 "chr() arg not in range(0x110000)");
2745 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002746 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002747
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002748 if (ordinal < 256)
2749 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002750
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002751 v = PyUnicode_New(1, ordinal);
2752 if (v == NULL)
2753 return NULL;
2754 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002755 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002756 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002757}
2758
Alexander Belopolsky40018472011-02-26 01:02:56 +00002759PyObject *
2760PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002761{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002762 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002763 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002764 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002765 if (PyUnicode_READY(obj))
2766 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002767 Py_INCREF(obj);
2768 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002769 }
2770 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002771 /* For a Unicode subtype that's not a Unicode object,
2772 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002773 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002774 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002775 PyErr_Format(PyExc_TypeError,
2776 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002777 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002778 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002779}
2780
Alexander Belopolsky40018472011-02-26 01:02:56 +00002781PyObject *
2782PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002783 const char *encoding,
2784 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002785{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002786 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002787 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002788
Guido van Rossumd57fd912000-03-10 22:53:23 +00002789 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002790 PyErr_BadInternalCall();
2791 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002792 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002793
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002794 /* Decoding bytes objects is the most common case and should be fast */
2795 if (PyBytes_Check(obj)) {
2796 if (PyBytes_GET_SIZE(obj) == 0) {
2797 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002798 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002799 }
2800 else {
2801 v = PyUnicode_Decode(
2802 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2803 encoding, errors);
2804 }
2805 return v;
2806 }
2807
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002808 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002809 PyErr_SetString(PyExc_TypeError,
2810 "decoding str is not supported");
2811 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002812 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002813
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002814 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2815 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2816 PyErr_Format(PyExc_TypeError,
2817 "coercing to str: need bytes, bytearray "
2818 "or buffer-like object, %.80s found",
2819 Py_TYPE(obj)->tp_name);
2820 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002821 }
Tim Petersced69f82003-09-16 20:30:58 +00002822
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002823 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002824 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002825 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002826 }
Tim Petersced69f82003-09-16 20:30:58 +00002827 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002828 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002829
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002830 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002831 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002832}
2833
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding is normalized to "utf-8".

   The result, including its terminating '\0', is written to "lower",
   which has room for "lower_len" bytes.

   Return 0 on error (the normalized name, including the "utf-8" default,
   does not fit in lower_len-1 characters), 1 on success.  On error the
   contents of "lower" are unspecified. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    static const char default_encoding[] = "utf-8";
    const char *e;
    char *l;
    char *l_end;

    /* without this, lower_len - 1 below would wrap around */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* sizeof counts the terminating '\0' */
        if (lower_len < sizeof(default_encoding))
            return 0;
        strcpy(lower, default_encoding);
        return 1;
    }
    e = encoding;
    l = lower;
    /* last writable slot, reserved for the terminating '\0' */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2870
Alexander Belopolsky40018472011-02-26 01:02:56 +00002871PyObject *
2872PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002873 Py_ssize_t size,
2874 const char *encoding,
2875 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002876{
2877 PyObject *buffer = NULL, *unicode;
2878 Py_buffer info;
2879 char lower[11]; /* Enough for any encoding shortcut */
2880
Fred Drakee4315f52000-05-09 19:53:39 +00002881 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002882 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002883 if ((strcmp(lower, "utf-8") == 0) ||
2884 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002885 return PyUnicode_DecodeUTF8(s, size, errors);
2886 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002887 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002888 (strcmp(lower, "iso-8859-1") == 0))
2889 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002890#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002891 else if (strcmp(lower, "mbcs") == 0)
2892 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002893#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002894 else if (strcmp(lower, "ascii") == 0)
2895 return PyUnicode_DecodeASCII(s, size, errors);
2896 else if (strcmp(lower, "utf-16") == 0)
2897 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2898 else if (strcmp(lower, "utf-32") == 0)
2899 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2900 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002901
2902 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002903 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002904 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002905 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002906 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002907 if (buffer == NULL)
2908 goto onError;
2909 unicode = PyCodec_Decode(buffer, encoding, errors);
2910 if (unicode == NULL)
2911 goto onError;
2912 if (!PyUnicode_Check(unicode)) {
2913 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002914 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002915 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002916 Py_DECREF(unicode);
2917 goto onError;
2918 }
2919 Py_DECREF(buffer);
Victor Stinner17efeed2011-10-04 20:05:46 +02002920#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02002921 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002922 Py_DECREF(unicode);
2923 return NULL;
2924 }
Victor Stinner17efeed2011-10-04 20:05:46 +02002925#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002926 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002927 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002928
Benjamin Peterson29060642009-01-31 22:14:21 +00002929 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002930 Py_XDECREF(buffer);
2931 return NULL;
2932}
2933
Alexander Belopolsky40018472011-02-26 01:02:56 +00002934PyObject *
2935PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002936 const char *encoding,
2937 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002938{
2939 PyObject *v;
2940
2941 if (!PyUnicode_Check(unicode)) {
2942 PyErr_BadArgument();
2943 goto onError;
2944 }
2945
2946 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002947 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002948
2949 /* Decode via the codec registry */
2950 v = PyCodec_Decode(unicode, encoding, errors);
2951 if (v == NULL)
2952 goto onError;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002953 assert(_PyUnicode_CheckConsistency(v, 1));
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002954 return v;
2955
Benjamin Peterson29060642009-01-31 22:14:21 +00002956 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002957 return NULL;
2958}
2959
Alexander Belopolsky40018472011-02-26 01:02:56 +00002960PyObject *
2961PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002962 const char *encoding,
2963 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002964{
2965 PyObject *v;
2966
2967 if (!PyUnicode_Check(unicode)) {
2968 PyErr_BadArgument();
2969 goto onError;
2970 }
2971
2972 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002973 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002974
2975 /* Decode via the codec registry */
2976 v = PyCodec_Decode(unicode, encoding, errors);
2977 if (v == NULL)
2978 goto onError;
2979 if (!PyUnicode_Check(v)) {
2980 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002981 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002982 Py_TYPE(v)->tp_name);
2983 Py_DECREF(v);
2984 goto onError;
2985 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002986 assert(_PyUnicode_CheckConsistency(v, 1));
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002987 return v;
2988
Benjamin Peterson29060642009-01-31 22:14:21 +00002989 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002990 return NULL;
2991}
2992
Alexander Belopolsky40018472011-02-26 01:02:56 +00002993PyObject *
2994PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002995 Py_ssize_t size,
2996 const char *encoding,
2997 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002998{
2999 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003000
Guido van Rossumd57fd912000-03-10 22:53:23 +00003001 unicode = PyUnicode_FromUnicode(s, size);
3002 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003003 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003004 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3005 Py_DECREF(unicode);
3006 return v;
3007}
3008
Alexander Belopolsky40018472011-02-26 01:02:56 +00003009PyObject *
3010PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003011 const char *encoding,
3012 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003013{
3014 PyObject *v;
3015
3016 if (!PyUnicode_Check(unicode)) {
3017 PyErr_BadArgument();
3018 goto onError;
3019 }
3020
3021 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003022 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003023
3024 /* Encode via the codec registry */
3025 v = PyCodec_Encode(unicode, encoding, errors);
3026 if (v == NULL)
3027 goto onError;
3028 return v;
3029
Benjamin Peterson29060642009-01-31 22:14:21 +00003030 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003031 return NULL;
3032}
3033
Victor Stinnerad158722010-10-27 00:25:46 +00003034PyObject *
3035PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003036{
Victor Stinner99b95382011-07-04 14:23:54 +02003037#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003038 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3039 PyUnicode_GET_SIZE(unicode),
3040 NULL);
3041#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003042 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003043#else
Victor Stinner793b5312011-04-27 00:24:21 +02003044 PyInterpreterState *interp = PyThreadState_GET()->interp;
3045 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3046 cannot use it to encode and decode filenames before it is loaded. Load
3047 the Python codec requires to encode at least its own filename. Use the C
3048 version of the locale codec until the codec registry is initialized and
3049 the Python codec is loaded.
3050
3051 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3052 cannot only rely on it: check also interp->fscodec_initialized for
3053 subinterpreters. */
3054 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003055 return PyUnicode_AsEncodedString(unicode,
3056 Py_FileSystemDefaultEncoding,
3057 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003058 }
3059 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003060 /* locale encoding with surrogateescape */
3061 wchar_t *wchar;
3062 char *bytes;
3063 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00003064 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003065
3066 wchar = PyUnicode_AsWideCharString(unicode, NULL);
3067 if (wchar == NULL)
3068 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003069 bytes = _Py_wchar2char(wchar, &error_pos);
3070 if (bytes == NULL) {
3071 if (error_pos != (size_t)-1) {
3072 char *errmsg = strerror(errno);
3073 PyObject *exc = NULL;
3074 if (errmsg == NULL)
3075 errmsg = "Py_wchar2char() failed";
3076 raise_encode_exception(&exc,
3077 "filesystemencoding",
3078 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
3079 error_pos, error_pos+1,
3080 errmsg);
3081 Py_XDECREF(exc);
3082 }
3083 else
3084 PyErr_NoMemory();
3085 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003086 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003087 }
3088 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003089
3090 bytes_obj = PyBytes_FromString(bytes);
3091 PyMem_Free(bytes);
3092 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00003093 }
Victor Stinnerad158722010-10-27 00:25:46 +00003094#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003095}
3096
Alexander Belopolsky40018472011-02-26 01:02:56 +00003097PyObject *
3098PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003099 const char *encoding,
3100 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003101{
3102 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003103 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003104
Guido van Rossumd57fd912000-03-10 22:53:23 +00003105 if (!PyUnicode_Check(unicode)) {
3106 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003107 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003108 }
Fred Drakee4315f52000-05-09 19:53:39 +00003109
Fred Drakee4315f52000-05-09 19:53:39 +00003110 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003111 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003112 if ((strcmp(lower, "utf-8") == 0) ||
3113 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003114 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003115 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003116 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003117 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003118 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003119 }
Victor Stinner37296e82010-06-10 13:36:23 +00003120 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003121 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003122 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003123 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003124#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00003125 else if (strcmp(lower, "mbcs") == 0)
3126 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
3127 PyUnicode_GET_SIZE(unicode),
3128 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003129#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003130 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003131 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003132 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003133
3134 /* Encode via the codec registry */
3135 v = PyCodec_Encode(unicode, encoding, errors);
3136 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003137 return NULL;
3138
3139 /* The normal path */
3140 if (PyBytes_Check(v))
3141 return v;
3142
3143 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003144 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003145 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003146 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003147
3148 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3149 "encoder %s returned bytearray instead of bytes",
3150 encoding);
3151 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003152 Py_DECREF(v);
3153 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003154 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003155
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003156 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3157 Py_DECREF(v);
3158 return b;
3159 }
3160
3161 PyErr_Format(PyExc_TypeError,
3162 "encoder did not return a bytes object (type=%.400s)",
3163 Py_TYPE(v)->tp_name);
3164 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003165 return NULL;
3166}
3167
Alexander Belopolsky40018472011-02-26 01:02:56 +00003168PyObject *
3169PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003170 const char *encoding,
3171 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003172{
3173 PyObject *v;
3174
3175 if (!PyUnicode_Check(unicode)) {
3176 PyErr_BadArgument();
3177 goto onError;
3178 }
3179
3180 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003181 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003182
3183 /* Encode via the codec registry */
3184 v = PyCodec_Encode(unicode, encoding, errors);
3185 if (v == NULL)
3186 goto onError;
3187 if (!PyUnicode_Check(v)) {
3188 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003189 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003190 Py_TYPE(v)->tp_name);
3191 Py_DECREF(v);
3192 goto onError;
3193 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003194 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003195
Benjamin Peterson29060642009-01-31 22:14:21 +00003196 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003197 return NULL;
3198}
3199
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003200PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003201PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003202 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003203 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3204}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003205
Christian Heimes5894ba72007-11-04 11:43:14 +00003206PyObject*
3207PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3208{
Victor Stinner99b95382011-07-04 14:23:54 +02003209#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003210 return PyUnicode_DecodeMBCS(s, size, NULL);
3211#elif defined(__APPLE__)
3212 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
3213#else
Victor Stinner793b5312011-04-27 00:24:21 +02003214 PyInterpreterState *interp = PyThreadState_GET()->interp;
3215 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3216 cannot use it to encode and decode filenames before it is loaded. Load
3217 the Python codec requires to encode at least its own filename. Use the C
3218 version of the locale codec until the codec registry is initialized and
3219 the Python codec is loaded.
3220
3221 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3222 cannot only rely on it: check also interp->fscodec_initialized for
3223 subinterpreters. */
3224 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003225 return PyUnicode_Decode(s, size,
3226 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003227 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003228 }
3229 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003230 /* locale encoding with surrogateescape */
3231 wchar_t *wchar;
3232 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00003233 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003234
3235 if (s[size] != '\0' || size != strlen(s)) {
3236 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3237 return NULL;
3238 }
3239
Victor Stinner168e1172010-10-16 23:16:16 +00003240 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003241 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00003242 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003243
Victor Stinner168e1172010-10-16 23:16:16 +00003244 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003245 PyMem_Free(wchar);
3246 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003247 }
Victor Stinnerad158722010-10-27 00:25:46 +00003248#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003249}
3250
Martin v. Löwis011e8422009-05-05 04:43:17 +00003251
3252int
3253PyUnicode_FSConverter(PyObject* arg, void* addr)
3254{
3255 PyObject *output = NULL;
3256 Py_ssize_t size;
3257 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003258 if (arg == NULL) {
3259 Py_DECREF(*(PyObject**)addr);
3260 return 1;
3261 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003262 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003263 output = arg;
3264 Py_INCREF(output);
3265 }
3266 else {
3267 arg = PyUnicode_FromObject(arg);
3268 if (!arg)
3269 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003270 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003271 Py_DECREF(arg);
3272 if (!output)
3273 return 0;
3274 if (!PyBytes_Check(output)) {
3275 Py_DECREF(output);
3276 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3277 return 0;
3278 }
3279 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003280 size = PyBytes_GET_SIZE(output);
3281 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003282 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003283 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003284 Py_DECREF(output);
3285 return 0;
3286 }
3287 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003288 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003289}
3290
3291
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003292int
3293PyUnicode_FSDecoder(PyObject* arg, void* addr)
3294{
3295 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003296 if (arg == NULL) {
3297 Py_DECREF(*(PyObject**)addr);
3298 return 1;
3299 }
3300 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003301 if (PyUnicode_READY(arg))
3302 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003303 output = arg;
3304 Py_INCREF(output);
3305 }
3306 else {
3307 arg = PyBytes_FromObject(arg);
3308 if (!arg)
3309 return 0;
3310 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3311 PyBytes_GET_SIZE(arg));
3312 Py_DECREF(arg);
3313 if (!output)
3314 return 0;
3315 if (!PyUnicode_Check(output)) {
3316 Py_DECREF(output);
3317 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3318 return 0;
3319 }
3320 }
Victor Stinner065836e2011-10-27 01:56:33 +02003321 if (PyUnicode_READY(output) < 0) {
3322 Py_DECREF(output);
3323 return 0;
3324 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003325 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003326 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003327 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3328 Py_DECREF(output);
3329 return 0;
3330 }
3331 *(PyObject**)addr = output;
3332 return Py_CLEANUP_SUPPORTED;
3333}
3334
3335
Martin v. Löwis5b222132007-06-10 09:51:05 +00003336char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003337PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003338{
Christian Heimesf3863112007-11-22 07:46:41 +00003339 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003340 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
3341
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003342 if (!PyUnicode_Check(unicode)) {
3343 PyErr_BadArgument();
3344 return NULL;
3345 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003346 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003347 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003348
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003349 if (PyUnicode_UTF8(unicode) == NULL) {
3350 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003351 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3352 if (bytes == NULL)
3353 return NULL;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003354 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3355 if (_PyUnicode_UTF8(u) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003356 Py_DECREF(bytes);
3357 return NULL;
3358 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003359 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
3360 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003361 Py_DECREF(bytes);
3362 }
3363
3364 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003365 *psize = PyUnicode_UTF8_LENGTH(unicode);
3366 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003367}
3368
3369char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003370PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003371{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003372 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3373}
3374
3375#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02003376static int unicode_as_unicode_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003377#endif
3378
3379
3380Py_UNICODE *
3381PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3382{
3383 PyUnicodeObject *u;
3384 const unsigned char *one_byte;
3385#if SIZEOF_WCHAR_T == 4
3386 const Py_UCS2 *two_bytes;
3387#else
3388 const Py_UCS4 *four_bytes;
3389 const Py_UCS4 *ucs4_end;
3390 Py_ssize_t num_surrogates;
3391#endif
3392 wchar_t *w;
3393 wchar_t *wchar_end;
3394
3395 if (!PyUnicode_Check(unicode)) {
3396 PyErr_BadArgument();
3397 return NULL;
3398 }
3399 u = (PyUnicodeObject*)unicode;
3400 if (_PyUnicode_WSTR(u) == NULL) {
3401 /* Non-ASCII compact unicode object */
3402 assert(_PyUnicode_KIND(u) != 0);
3403 assert(PyUnicode_IS_READY(u));
3404
3405#ifdef Py_DEBUG
3406 ++unicode_as_unicode_calls;
3407#endif
3408
3409 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
3410#if SIZEOF_WCHAR_T == 2
3411 four_bytes = PyUnicode_4BYTE_DATA(u);
3412 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
3413 num_surrogates = 0;
3414
3415 for (; four_bytes < ucs4_end; ++four_bytes) {
3416 if (*four_bytes > 0xFFFF)
3417 ++num_surrogates;
3418 }
3419
3420 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
3421 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
3422 if (!_PyUnicode_WSTR(u)) {
3423 PyErr_NoMemory();
3424 return NULL;
3425 }
3426 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
3427
3428 w = _PyUnicode_WSTR(u);
3429 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
3430 four_bytes = PyUnicode_4BYTE_DATA(u);
3431 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3432 if (*four_bytes > 0xFFFF) {
3433 /* encode surrogate pair in this case */
3434 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
3435 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
3436 }
3437 else
3438 *w = *four_bytes;
3439
3440 if (w > wchar_end) {
3441 assert(0 && "Miscalculated string end");
3442 }
3443 }
3444 *w = 0;
3445#else
3446 /* sizeof(wchar_t) == 4 */
3447 Py_FatalError("Impossible unicode object state, wstr and str "
3448 "should share memory already.");
3449 return NULL;
3450#endif
3451 }
3452 else {
3453 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3454 (_PyUnicode_LENGTH(u) + 1));
3455 if (!_PyUnicode_WSTR(u)) {
3456 PyErr_NoMemory();
3457 return NULL;
3458 }
3459 if (!PyUnicode_IS_COMPACT_ASCII(u))
3460 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
3461 w = _PyUnicode_WSTR(u);
3462 wchar_end = w + _PyUnicode_LENGTH(u);
3463
3464 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
3465 one_byte = PyUnicode_1BYTE_DATA(u);
3466 for (; w < wchar_end; ++one_byte, ++w)
3467 *w = *one_byte;
3468 /* null-terminate the wstr */
3469 *w = 0;
3470 }
3471 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
3472#if SIZEOF_WCHAR_T == 4
3473 two_bytes = PyUnicode_2BYTE_DATA(u);
3474 for (; w < wchar_end; ++two_bytes, ++w)
3475 *w = *two_bytes;
3476 /* null-terminate the wstr */
3477 *w = 0;
3478#else
3479 /* sizeof(wchar_t) == 2 */
3480 PyObject_FREE(_PyUnicode_WSTR(u));
3481 _PyUnicode_WSTR(u) = NULL;
3482 Py_FatalError("Impossible unicode object state, wstr "
3483 "and str should share memory already.");
3484 return NULL;
3485#endif
3486 }
3487 else {
3488 assert(0 && "This should never happen.");
3489 }
3490 }
3491 }
3492 if (size != NULL)
3493 *size = PyUnicode_WSTR_LENGTH(u);
3494 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003495}
3496
Alexander Belopolsky40018472011-02-26 01:02:56 +00003497Py_UNICODE *
3498PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003499{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003500 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003501}
3502
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003503
Alexander Belopolsky40018472011-02-26 01:02:56 +00003504Py_ssize_t
3505PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003506{
3507 if (!PyUnicode_Check(unicode)) {
3508 PyErr_BadArgument();
3509 goto onError;
3510 }
3511 return PyUnicode_GET_SIZE(unicode);
3512
Benjamin Peterson29060642009-01-31 22:14:21 +00003513 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003514 return -1;
3515}
3516
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003517Py_ssize_t
3518PyUnicode_GetLength(PyObject *unicode)
3519{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003520 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003521 PyErr_BadArgument();
3522 return -1;
3523 }
3524
3525 return PyUnicode_GET_LENGTH(unicode);
3526}
3527
3528Py_UCS4
3529PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3530{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003531 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3532 PyErr_BadArgument();
3533 return (Py_UCS4)-1;
3534 }
3535 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3536 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003537 return (Py_UCS4)-1;
3538 }
3539 return PyUnicode_READ_CHAR(unicode, index);
3540}
3541
3542int
3543PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3544{
3545 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003546 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003547 return -1;
3548 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003549 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3550 PyErr_SetString(PyExc_IndexError, "string index out of range");
3551 return -1;
3552 }
3553 if (_PyUnicode_Dirty(unicode))
3554 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003555 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3556 index, ch);
3557 return 0;
3558}
3559
/* Return the name of the default encoding, which is always "utf-8".
   The string has static storage; callers must not modify or free it. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3565
Victor Stinner554f3f02010-06-16 23:33:54 +00003566/* create or adjust a UnicodeDecodeError */
3567static void
3568make_decode_exception(PyObject **exceptionObject,
3569 const char *encoding,
3570 const char *input, Py_ssize_t length,
3571 Py_ssize_t startpos, Py_ssize_t endpos,
3572 const char *reason)
3573{
3574 if (*exceptionObject == NULL) {
3575 *exceptionObject = PyUnicodeDecodeError_Create(
3576 encoding, input, length, startpos, endpos, reason);
3577 }
3578 else {
3579 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3580 goto onError;
3581 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3582 goto onError;
3583 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3584 goto onError;
3585 }
3586 return;
3587
3588onError:
3589 Py_DECREF(*exceptionObject);
3590 *exceptionObject = NULL;
3591}
3592
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003593/* error handling callback helper:
3594 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003595 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003596 and adjust various state variables.
3597 return 0 on success, -1 on error
3598*/
3599
Alexander Belopolsky40018472011-02-26 01:02:56 +00003600static int
3601unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003602 const char *encoding, const char *reason,
3603 const char **input, const char **inend, Py_ssize_t *startinpos,
3604 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3605 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003606{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003607 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003608
3609 PyObject *restuple = NULL;
3610 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003611 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003612 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003613 Py_ssize_t requiredsize;
3614 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003615 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003616 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003617 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003618 int res = -1;
3619
3620 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003621 *errorHandler = PyCodec_LookupError(errors);
3622 if (*errorHandler == NULL)
3623 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003624 }
3625
Victor Stinner554f3f02010-06-16 23:33:54 +00003626 make_decode_exception(exceptionObject,
3627 encoding,
3628 *input, *inend - *input,
3629 *startinpos, *endinpos,
3630 reason);
3631 if (*exceptionObject == NULL)
3632 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003633
3634 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3635 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003636 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003637 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003638 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003639 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003640 }
3641 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003642 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003643
3644 /* Copy back the bytes variables, which might have been modified by the
3645 callback */
3646 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3647 if (!inputobj)
3648 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003649 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003650 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003651 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003652 *input = PyBytes_AS_STRING(inputobj);
3653 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003654 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003655 /* we can DECREF safely, as the exception has another reference,
3656 so the object won't go away. */
3657 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003658
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003659 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003660 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003661 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003662 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3663 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003664 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003665
3666 /* need more space? (at least enough for what we
3667 have+the replacement+the rest of the string (starting
3668 at the new input position), so we won't have to check space
3669 when there are no errors in the rest of the string) */
3670 repptr = PyUnicode_AS_UNICODE(repunicode);
3671 repsize = PyUnicode_GET_SIZE(repunicode);
3672 requiredsize = *outpos + repsize + insize-newpos;
3673 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003674 if (requiredsize<2*outsize)
3675 requiredsize = 2*outsize;
Victor Stinnerfe226c02011-10-03 03:52:20 +02003676 if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003677 goto onError;
3678 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003679 }
3680 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003681 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003682 Py_UNICODE_COPY(*outptr, repptr, repsize);
3683 *outptr += repsize;
3684 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003685
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003686 /* we made it! */
3687 res = 0;
3688
Benjamin Peterson29060642009-01-31 22:14:21 +00003689 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003690 Py_XDECREF(restuple);
3691 return res;
3692}
3693
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003694/* --- UTF-7 Codec -------------------------------------------------------- */
3695
Antoine Pitrou244651a2009-05-04 18:56:13 +00003696/* See RFC2152 for details. We encode conservatively and decode liberally. */
3697
/* Base-64 helpers for the UTF-7 codec.  Every macro below may evaluate
   its argument more than once, so only pass side-effect-free
   expressions. */

/* Non-zero when c is one of the 64 base-64 alphabet characters
   (A-Z, a-z, 0-9, '+', '/'). */

#define IS_BASE64(c)                            \
    ((c) == '+' || (c) == '/' ||                \
     ((c) >= '0' && (c) <= '9') ||              \
     ((c) >= 'a' && (c) <= 'z') ||              \
     ((c) >= 'A' && (c) <= 'Z'))

/* Six-bit value of the base-64 character c.  Only meaningful when
   IS_BASE64(c) holds; anything unrecognised maps to 63, like '/'. */

#define FROM_BASE64(c)                                  \
    ((c) == '+' ? 62 :                                  \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     63)

/* Base-64 character encoding the low six bits of n (higher bits are
   masked off). */

#define TO_BASE64(n)                            \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"               \
     "abcdefghijklmnopqrstuvwxyz"               \
     "0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: true when a byte met outside a shift sequence decodes
 * as itself.  Decoding is permissive: every value up to 127 qualifies
 * except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c)                        \
    ((c) != '+' && (c) <= 127)
3728
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is read-only, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        - character value; values outside 1..127 are never direct
 *            (the range test short-circuits before the table lookup).
 * directO  - non-zero to emit Set O characters directly.
 * directWS - non-zero to emit whitespace directly.
 *
 * All arguments are parenthesized in the expansion so that compound
 * expressions such as `a || b` can be passed safely.  c is evaluated
 * several times; pass a side-effect-free expression. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003774
Alexander Belopolsky40018472011-02-26 01:02:56 +00003775PyObject *
3776PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003777 Py_ssize_t size,
3778 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003779{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003780 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3781}
3782
Antoine Pitrou244651a2009-05-04 18:56:13 +00003783/* The decoder. The only state we preserve is our read position,
3784 * i.e. how many characters we have consumed. So if we end in the
3785 * middle of a shift sequence we have to back off the read position
3786 * and the output to the beginning of the sequence, otherwise we lose
3787 * all the shift state (seen bits, number of bits seen, high
3788 * surrogate). */
3789
Alexander Belopolsky40018472011-02-26 01:02:56 +00003790PyObject *
3791PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003792 Py_ssize_t size,
3793 const char *errors,
3794 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003795{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003796 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003797 Py_ssize_t startinpos;
3798 Py_ssize_t endinpos;
3799 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003800 const char *e;
3801 PyUnicodeObject *unicode;
3802 Py_UNICODE *p;
3803 const char *errmsg = "";
3804 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003805 Py_UNICODE *shiftOutStart;
3806 unsigned int base64bits = 0;
3807 unsigned long base64buffer = 0;
3808 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003809 PyObject *errorHandler = NULL;
3810 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003811
3812 unicode = _PyUnicode_New(size);
3813 if (!unicode)
3814 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003815 if (size == 0) {
3816 if (consumed)
3817 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003818 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003819 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003820
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003821 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003822 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003823 e = s + size;
3824
3825 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003826 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003827 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003828 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003829
Antoine Pitrou244651a2009-05-04 18:56:13 +00003830 if (inShift) { /* in a base-64 section */
3831 if (IS_BASE64(ch)) { /* consume a base-64 character */
3832 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3833 base64bits += 6;
3834 s++;
3835 if (base64bits >= 16) {
3836 /* we have enough bits for a UTF-16 value */
3837 Py_UNICODE outCh = (Py_UNICODE)
3838 (base64buffer >> (base64bits-16));
3839 base64bits -= 16;
3840 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3841 if (surrogate) {
3842 /* expecting a second surrogate */
3843 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3844#ifdef Py_UNICODE_WIDE
3845 *p++ = (((surrogate & 0x3FF)<<10)
3846 | (outCh & 0x3FF)) + 0x10000;
3847#else
3848 *p++ = surrogate;
3849 *p++ = outCh;
3850#endif
3851 surrogate = 0;
3852 }
3853 else {
3854 surrogate = 0;
3855 errmsg = "second surrogate missing";
3856 goto utf7Error;
3857 }
3858 }
3859 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3860 /* first surrogate */
3861 surrogate = outCh;
3862 }
3863 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3864 errmsg = "unexpected second surrogate";
3865 goto utf7Error;
3866 }
3867 else {
3868 *p++ = outCh;
3869 }
3870 }
3871 }
3872 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003873 inShift = 0;
3874 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003875 if (surrogate) {
3876 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003877 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003878 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003879 if (base64bits > 0) { /* left-over bits */
3880 if (base64bits >= 6) {
3881 /* We've seen at least one base-64 character */
3882 errmsg = "partial character in shift sequence";
3883 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003884 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003885 else {
3886 /* Some bits remain; they should be zero */
3887 if (base64buffer != 0) {
3888 errmsg = "non-zero padding bits in shift sequence";
3889 goto utf7Error;
3890 }
3891 }
3892 }
3893 if (ch != '-') {
3894 /* '-' is absorbed; other terminating
3895 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003896 *p++ = ch;
3897 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003898 }
3899 }
3900 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003901 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003902 s++; /* consume '+' */
3903 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003904 s++;
3905 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003906 }
3907 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003908 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003909 shiftOutStart = p;
3910 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003911 }
3912 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003913 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003914 *p++ = ch;
3915 s++;
3916 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003917 else {
3918 startinpos = s-starts;
3919 s++;
3920 errmsg = "unexpected special character";
3921 goto utf7Error;
3922 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003923 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003924utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003925 outpos = p-PyUnicode_AS_UNICODE(unicode);
3926 endinpos = s-starts;
3927 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003928 errors, &errorHandler,
3929 "utf7", errmsg,
3930 &starts, &e, &startinpos, &endinpos, &exc, &s,
3931 &unicode, &outpos, &p))
3932 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003933 }
3934
Antoine Pitrou244651a2009-05-04 18:56:13 +00003935 /* end of string */
3936
3937 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3938 /* if we're in an inconsistent state, that's an error */
3939 if (surrogate ||
3940 (base64bits >= 6) ||
3941 (base64bits > 0 && base64buffer != 0)) {
3942 outpos = p-PyUnicode_AS_UNICODE(unicode);
3943 endinpos = size;
3944 if (unicode_decode_call_errorhandler(
3945 errors, &errorHandler,
3946 "utf7", "unterminated shift sequence",
3947 &starts, &e, &startinpos, &endinpos, &exc, &s,
3948 &unicode, &outpos, &p))
3949 goto onError;
3950 if (s < e)
3951 goto restart;
3952 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003953 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003954
3955 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003956 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003957 if (inShift) {
3958 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003959 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003960 }
3961 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003962 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003963 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003964 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003965
Victor Stinnerfe226c02011-10-03 03:52:20 +02003966 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003967 goto onError;
3968
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003969 Py_XDECREF(errorHandler);
3970 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02003971#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02003972 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003973 Py_DECREF(unicode);
3974 return NULL;
3975 }
Victor Stinner17efeed2011-10-04 20:05:46 +02003976#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02003977 assert(_PyUnicode_CheckConsistency(unicode, 1));
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003978 return (PyObject *)unicode;
3979
Benjamin Peterson29060642009-01-31 22:14:21 +00003980 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003981 Py_XDECREF(errorHandler);
3982 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003983 Py_DECREF(unicode);
3984 return NULL;
3985}
3986
3987
Alexander Belopolsky40018472011-02-26 01:02:56 +00003988PyObject *
3989PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003990 Py_ssize_t size,
3991 int base64SetO,
3992 int base64WhiteSpace,
3993 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003994{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003995 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003996 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003997 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003998 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003999 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004000 unsigned int base64bits = 0;
4001 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004002 char * out;
4003 char * start;
4004
4005 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004006 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004007
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00004008 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004009 return PyErr_NoMemory();
4010
Antoine Pitrou244651a2009-05-04 18:56:13 +00004011 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004012 if (v == NULL)
4013 return NULL;
4014
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004015 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004016 for (;i < size; ++i) {
4017 Py_UNICODE ch = s[i];
4018
Antoine Pitrou244651a2009-05-04 18:56:13 +00004019 if (inShift) {
4020 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4021 /* shifting out */
4022 if (base64bits) { /* output remaining bits */
4023 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4024 base64buffer = 0;
4025 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004026 }
4027 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004028 /* Characters not in the BASE64 set implicitly unshift the sequence
4029 so no '-' is required, except if the character is itself a '-' */
4030 if (IS_BASE64(ch) || ch == '-') {
4031 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004032 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004033 *out++ = (char) ch;
4034 }
4035 else {
4036 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004037 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004038 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004039 else { /* not in a shift sequence */
4040 if (ch == '+') {
4041 *out++ = '+';
4042 *out++ = '-';
4043 }
4044 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4045 *out++ = (char) ch;
4046 }
4047 else {
4048 *out++ = '+';
4049 inShift = 1;
4050 goto encode_char;
4051 }
4052 }
4053 continue;
4054encode_char:
4055#ifdef Py_UNICODE_WIDE
4056 if (ch >= 0x10000) {
4057 /* code first surrogate */
4058 base64bits += 16;
4059 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4060 while (base64bits >= 6) {
4061 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4062 base64bits -= 6;
4063 }
4064 /* prepare second surrogate */
4065 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
4066 }
4067#endif
4068 base64bits += 16;
4069 base64buffer = (base64buffer << 16) | ch;
4070 while (base64bits >= 6) {
4071 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4072 base64bits -= 6;
4073 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004074 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004075 if (base64bits)
4076 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4077 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004078 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004079 if (_PyBytes_Resize(&v, out - start) < 0)
4080 return NULL;
4081 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004082}
4083
Antoine Pitrou244651a2009-05-04 18:56:13 +00004084#undef IS_BASE64
4085#undef FROM_BASE64
4086#undef TO_BASE64
4087#undef DECODE_DIRECT
4088#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004089
Guido van Rossumd57fd912000-03-10 22:53:23 +00004090/* --- UTF-8 Codec -------------------------------------------------------- */
4091
/* Map UTF-8 encoded prefix byte to sequence length.  Zero means
   illegal prefix.  See RFC 3629 for details.  The table is only ever
   read, hence const. */
static
const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
4113
Alexander Belopolsky40018472011-02-26 01:02:56 +00004114PyObject *
4115PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004116 Py_ssize_t size,
4117 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004118{
Walter Dörwald69652032004-09-07 20:24:22 +00004119 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4120}
4121
Antoine Pitrouab868312009-01-10 15:40:25 +00004122/* Mask to check or force alignment of a pointer to C 'long' boundaries */
4123#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
4124
4125/* Mask to quickly check whether a C 'long' contains a
4126 non-ASCII, UTF8-encoded char. */
4127#if (SIZEOF_LONG == 8)
4128# define ASCII_CHAR_MASK 0x8080808080808080L
4129#elif (SIZEOF_LONG == 4)
4130# define ASCII_CHAR_MASK 0x80808080L
4131#else
4132# error C 'long' size should be either 4 or 8!
4133#endif
4134
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004135/* Scans a UTF-8 string and returns the maximum character to be expected,
4136 the size of the decoded unicode string and if any major errors were
4137 encountered.
4138
4139 This function does check basic UTF-8 sanity, it does however NOT CHECK
4140 if the string contains surrogates, and if all continuation bytes are
4141 within the correct ranges, these checks are performed in
4142 PyUnicode_DecodeUTF8Stateful.
4143
4144 If it sets has_errors to 1, it means the value of unicode_size and max_char
4145 will be bogus and you should not rely on useful information in them.
4146 */
4147static Py_UCS4
4148utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
4149 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
4150 int *has_errors)
4151{
4152 Py_ssize_t n;
4153 Py_ssize_t char_count = 0;
4154 Py_UCS4 max_char = 127, new_max;
4155 Py_UCS4 upper_bound;
4156 const unsigned char *p = (const unsigned char *)s;
4157 const unsigned char *end = p + string_size;
4158 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
4159 int err = 0;
4160
4161 for (; p < end && !err; ++p, ++char_count) {
4162 /* Only check value if it's not a ASCII char... */
4163 if (*p < 0x80) {
4164 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4165 an explanation. */
4166 if (!((size_t) p & LONG_PTR_MASK)) {
4167 /* Help register allocation */
4168 register const unsigned char *_p = p;
4169 while (_p < aligned_end) {
4170 unsigned long value = *(unsigned long *) _p;
4171 if (value & ASCII_CHAR_MASK)
4172 break;
4173 _p += SIZEOF_LONG;
4174 char_count += SIZEOF_LONG;
4175 }
4176 p = _p;
4177 if (p == end)
4178 break;
4179 }
4180 }
4181 if (*p >= 0x80) {
4182 n = utf8_code_length[*p];
4183 new_max = max_char;
4184 switch (n) {
4185 /* invalid start byte */
4186 case 0:
4187 err = 1;
4188 break;
4189 case 2:
4190 /* Code points between 0x00FF and 0x07FF inclusive.
4191 Approximate the upper bound of the code point,
4192 if this flips over 255 we can be sure it will be more
4193 than 255 and the string will need 2 bytes per code coint,
4194 if it stays under or equal to 255, we can be sure 1 byte
4195 is enough.
4196 ((*p & 0b00011111) << 6) | 0b00111111 */
4197 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
4198 if (max_char < upper_bound)
4199 new_max = upper_bound;
4200 /* Ensure we track at least that we left ASCII space. */
4201 if (new_max < 128)
4202 new_max = 128;
4203 break;
4204 case 3:
4205 /* Between 0x0FFF and 0xFFFF inclusive, so values are
4206 always > 255 and <= 65535 and will always need 2 bytes. */
4207 if (max_char < 65535)
4208 new_max = 65535;
4209 break;
4210 case 4:
4211 /* Code point will be above 0xFFFF for sure in this case. */
4212 new_max = 65537;
4213 break;
4214 /* Internal error, this should be caught by the first if */
4215 case 1:
4216 default:
4217 assert(0 && "Impossible case in utf8_max_char_and_size");
4218 err = 1;
4219 }
4220 /* Instead of number of overall bytes for this code point,
Georg Brandl7597add2011-10-05 16:36:47 +02004221 n contains the number of following bytes: */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004222 --n;
4223 /* Check if the follow up chars are all valid continuation bytes */
4224 if (n >= 1) {
4225 const unsigned char *cont;
4226 if ((p + n) >= end) {
4227 if (consumed == 0)
4228 /* incomplete data, non-incremental decoding */
4229 err = 1;
4230 break;
4231 }
4232 for (cont = p + 1; cont < (p + n); ++cont) {
4233 if ((*cont & 0xc0) != 0x80) {
4234 err = 1;
4235 break;
4236 }
4237 }
4238 p += n;
4239 }
4240 else
4241 err = 1;
4242 max_char = new_max;
4243 }
4244 }
4245
4246 if (unicode_size)
4247 *unicode_size = char_count;
4248 if (has_errors)
4249 *has_errors = err;
4250 return max_char;
4251}
4252
/* Store value at position index of buf, whose element type is selected
   by kind.  Like PyUnicode_WRITE, but additionally understands
   PyUnicode_WCHAR_KIND, i.e. it can write into the wstr field of the
   legacy unicode representation.  Any kind other than WCHAR, 1BYTE or
   2BYTE is written as Py_UCS4.  kind, buf, index and value are each
   evaluated exactly once.
   NOTE(review): the switch relies on the PyUnicode_*_KIND names being
   distinct integer constants - confirm against unicodeobject.h. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value)                 \
    do {                                                                \
        const int k_ = (kind);                                          \
        switch (k_) {                                                   \
        case PyUnicode_WCHAR_KIND:                                      \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);       \
            break;                                                      \
        case PyUnicode_1BYTE_KIND:                                      \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
            break;                                                      \
        case PyUnicode_2BYTE_KIND:                                      \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value);             \
            break;                                                      \
        default:                                                        \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value);             \
            break;                                                      \
        }                                                               \
    } while (0)
4267
Alexander Belopolsky40018472011-02-26 01:02:56 +00004268PyObject *
4269PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004270 Py_ssize_t size,
4271 const char *errors,
4272 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004273{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004274 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004275 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004276 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004277 Py_ssize_t startinpos;
4278 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00004279 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004280 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004281 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004282 PyObject *errorHandler = NULL;
4283 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004284 Py_UCS4 maxchar = 0;
4285 Py_ssize_t unicode_size;
4286 Py_ssize_t i;
4287 int kind;
4288 void *data;
4289 int has_errors;
4290 Py_UNICODE *error_outptr;
4291#if SIZEOF_WCHAR_T == 2
4292 Py_ssize_t wchar_offset = 0;
4293#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004294
Walter Dörwald69652032004-09-07 20:24:22 +00004295 if (size == 0) {
4296 if (consumed)
4297 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004298 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00004299 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004300 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
4301 consumed, &has_errors);
4302 if (has_errors) {
4303 unicode = _PyUnicode_New(size);
4304 if (!unicode)
4305 return NULL;
4306 kind = PyUnicode_WCHAR_KIND;
4307 data = PyUnicode_AS_UNICODE(unicode);
4308 assert(data != NULL);
4309 }
4310 else {
4311 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
4312 if (!unicode)
4313 return NULL;
4314 /* When the string is ASCII only, just use memcpy and return.
4315 unicode_size may be != size if there is an incomplete UTF-8
4316 sequence at the end of the ASCII block. */
4317 if (maxchar < 128 && size == unicode_size) {
4318 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
4319 return (PyObject *)unicode;
4320 }
4321 kind = PyUnicode_KIND(unicode);
4322 data = PyUnicode_DATA(unicode);
4323 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004324 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004325 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004326 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00004327 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004328
4329 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004330 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004331
4332 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004333 /* Fast path for runs of ASCII characters. Given that common UTF-8
4334 input will consist of an overwhelming majority of ASCII
4335 characters, we try to optimize for this case by checking
4336 as many characters as a C 'long' can contain.
4337 First, check if we can do an aligned read, as most CPUs have
4338 a penalty for unaligned reads.
4339 */
4340 if (!((size_t) s & LONG_PTR_MASK)) {
4341 /* Help register allocation */
4342 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004343 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004344 while (_s < aligned_end) {
4345 /* Read a whole long at a time (either 4 or 8 bytes),
4346 and do a fast unrolled copy if it only contains ASCII
4347 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004348 unsigned long value = *(unsigned long *) _s;
4349 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004350 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004351 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
4352 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
4353 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
4354 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004355#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004356 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
4357 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
4358 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
4359 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004360#endif
4361 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004362 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004363 }
4364 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004365 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004366 if (s == e)
4367 break;
4368 ch = (unsigned char)*s;
4369 }
4370 }
4371
4372 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004373 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004374 s++;
4375 continue;
4376 }
4377
4378 n = utf8_code_length[ch];
4379
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004380 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004381 if (consumed)
4382 break;
4383 else {
4384 errmsg = "unexpected end of data";
4385 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004386 endinpos = startinpos+1;
4387 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4388 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004389 goto utf8Error;
4390 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004391 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004392
4393 switch (n) {
4394
4395 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004396 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004397 startinpos = s-starts;
4398 endinpos = startinpos+1;
4399 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004400
4401 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004402 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004403 startinpos = s-starts;
4404 endinpos = startinpos+1;
4405 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004406
4407 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004408 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004409 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004410 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004411 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004412 goto utf8Error;
4413 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004414 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004415 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004416 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004417 break;
4418
4419 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004420 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4421 will result in surrogates in range d800-dfff. Surrogates are
4422 not valid UTF-8 so they are rejected.
4423 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4424 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004425 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004426 (s[2] & 0xc0) != 0x80 ||
4427 ((unsigned char)s[0] == 0xE0 &&
4428 (unsigned char)s[1] < 0xA0) ||
4429 ((unsigned char)s[0] == 0xED &&
4430 (unsigned char)s[1] > 0x9F)) {
4431 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004432 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004433 endinpos = startinpos + 1;
4434
4435 /* if s[1] first two bits are 1 and 0, then the invalid
4436 continuation byte is s[2], so increment endinpos by 1,
4437 if not, s[1] is invalid and endinpos doesn't need to
4438 be incremented. */
4439 if ((s[1] & 0xC0) == 0x80)
4440 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004441 goto utf8Error;
4442 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004443 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004444 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004445 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004446 break;
4447
4448 case 4:
4449 if ((s[1] & 0xc0) != 0x80 ||
4450 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004451 (s[3] & 0xc0) != 0x80 ||
4452 ((unsigned char)s[0] == 0xF0 &&
4453 (unsigned char)s[1] < 0x90) ||
4454 ((unsigned char)s[0] == 0xF4 &&
4455 (unsigned char)s[1] > 0x8F)) {
4456 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004457 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004458 endinpos = startinpos + 1;
4459 if ((s[1] & 0xC0) == 0x80) {
4460 endinpos++;
4461 if ((s[2] & 0xC0) == 0x80)
4462 endinpos++;
4463 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004464 goto utf8Error;
4465 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004466 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004467 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4468 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4469
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004470 /* If the string is flexible or we have native UCS-4, write
4471 directly.. */
4472 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
4473 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00004474
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004475 else {
4476 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00004477
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004478 /* translate from 10000..10FFFF to 0..FFFF */
4479 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00004480
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004481 /* high surrogate = top 10 bits added to D800 */
4482 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4483 (Py_UNICODE)(0xD800 + (ch >> 10)));
4484
4485 /* low surrogate = bottom 10 bits added to DC00 */
4486 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4487 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
4488 }
4489#if SIZEOF_WCHAR_T == 2
4490 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004491#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004492 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004493 }
4494 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004495 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004496
Benjamin Peterson29060642009-01-31 22:14:21 +00004497 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004498 /* If this is not yet a resizable string, make it one.. */
4499 if (kind != PyUnicode_WCHAR_KIND) {
4500 const Py_UNICODE *u;
4501 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
4502 if (!new_unicode)
4503 goto onError;
4504 u = PyUnicode_AsUnicode((PyObject *)unicode);
4505 if (!u)
4506 goto onError;
4507#if SIZEOF_WCHAR_T == 2
4508 i += wchar_offset;
4509#endif
4510 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
4511 Py_DECREF(unicode);
4512 unicode = new_unicode;
4513 kind = 0;
4514 data = PyUnicode_AS_UNICODE(new_unicode);
4515 assert(data != NULL);
4516 }
4517 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00004518 if (unicode_decode_call_errorhandler(
4519 errors, &errorHandler,
4520 "utf8", errmsg,
4521 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004522 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00004523 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004524 /* Update data because unicode_decode_call_errorhandler might have
4525 re-created or resized the unicode object. */
4526 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004527 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004528 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004529 /* Ensure the unicode_size calculation above was correct: */
4530 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
4531
Walter Dörwald69652032004-09-07 20:24:22 +00004532 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004533 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004534
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004535 /* Adjust length and ready string when it contained errors and
4536 is of the old resizable kind. */
4537 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004538 if (PyUnicode_Resize((PyObject**)&unicode, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004539 goto onError;
4540 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004541
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004542 Py_XDECREF(errorHandler);
4543 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02004544#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004545 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004546 Py_DECREF(unicode);
4547 return NULL;
4548 }
Victor Stinner17efeed2011-10-04 20:05:46 +02004549#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004550 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004551 return (PyObject *)unicode;
4552
Benjamin Peterson29060642009-01-31 22:14:21 +00004553 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004554 Py_XDECREF(errorHandler);
4555 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004556 Py_DECREF(unicode);
4557 return NULL;
4558}
4559
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004560#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00004561
#ifdef __APPLE__

/* Minimal UTF-8 decoder with a built-in surrogateescape policy: any byte
   that does not start a valid sequence is emitted as 0xDC00 + byte and
   decoding resumes at the following byte.  Used to decode the command line
   arguments on Mac OS X.

   Returns a NUL-terminated wchar_t buffer obtained from PyMem_Malloc(), or
   NULL when the buffer cannot be allocated. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const unsigned char *in = (const unsigned char *)s;
    const unsigned char *end;
    wchar_t *buffer, *out;
    int seqlen;

    /* One output unit per input byte (plus the terminator) is always
       sufficient: no sequence produces more units than it has bytes. */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    buffer = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (buffer == NULL)
        return NULL;

    /* Walk the input one sequence at a time */
    out = buffer;
    end = in + size;
    while (in < end) {
        /* Lead byte; still holds the raw byte when we escape below */
        Py_UCS4 ch = in[0];

        if (ch < 0x80) {
            /* plain ASCII */
            *out++ = (wchar_t)ch;
            in++;
            continue;
        }

        seqlen = utf8_code_length[ch];
        if (in + seqlen > end) {
            /* sequence is cut off by the end of the input */
            goto surrogateescape;
        }

        switch (seqlen) {
        case 0:
        case 1:
            /* not a valid lead byte */
            goto surrogateescape;

        case 2:
            if ((in[1] & 0xc0) != 0x80)
                goto surrogateescape;
            ch = ((in[0] & 0x1f) << 6) + (in[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            *out++ = (wchar_t)ch;
            break;

        case 3:
            /* \xed\xa0\x80-\xed\xbf\xbf would decode to the surrogates
               d800-dfff, which are not valid UTF-8, and \xe0 followed by
               a byte below \xa0 is an overlong form; both are rejected.
               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            if ((in[1] & 0xc0) != 0x80 ||
                (in[2] & 0xc0) != 0x80 ||
                (in[0] == 0xE0 && in[1] < 0xA0) ||
                (in[0] == 0xED && in[1] > 0x9F)) {
                goto surrogateescape;
            }
            ch = ((in[0] & 0x0f) << 12) + ((in[1] & 0x3f) << 6) +
                 (in[2] & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            *out++ = (wchar_t)ch;
            break;

        case 4:
            if ((in[1] & 0xc0) != 0x80 ||
                (in[2] & 0xc0) != 0x80 ||
                (in[3] & 0xc0) != 0x80 ||
                (in[0] == 0xF0 && in[1] < 0x90) ||
                (in[0] == 0xF4 && in[1] > 0x8F)) {
                goto surrogateescape;
            }
            ch = ((in[0] & 0x7) << 18) + ((in[1] & 0x3f) << 12) +
                 ((in[2] & 0x3f) << 6) + (in[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));

#if SIZEOF_WCHAR_T == 4
            *out++ = (wchar_t)ch;
#else
            /* 16-bit wchar_t: split into a surrogate pair */

            /* map 10000..10FFFF onto 0..FFFFF */
            ch -= 0x10000;

            /* high surrogate: D800 + top 10 bits */
            *out++ = (wchar_t)(0xD800 + (ch >> 10));

            /* low surrogate: DC00 + bottom 10 bits */
            *out++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
#endif
            break;
        }
        in += seqlen;
        continue;

      surrogateescape:
        /* escape the offending byte and resume right after it */
        *out++ = 0xDC00 + ch;
        in++;
    }
    *out = L'\0';
    return buffer;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004676
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004677/* Primary internal function which creates utf8 encoded bytes objects.
4678
4679 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004680 and allocate exactly as much space needed at the end. Else allocate the
4681 maximum possible needed (4 result bytes per Unicode character), and return
4682 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004683*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004684PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004685_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004686{
Tim Peters602f7402002-04-27 18:03:26 +00004687#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004688
Guido van Rossum98297ee2007-11-06 21:34:58 +00004689 Py_ssize_t i; /* index into s of next input byte */
4690 PyObject *result; /* result string object */
4691 char *p; /* next free byte in output buffer */
4692 Py_ssize_t nallocated; /* number of result bytes allocated */
4693 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004694 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004695 PyObject *errorHandler = NULL;
4696 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004697 int kind;
4698 void *data;
4699 Py_ssize_t size;
4700 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4701#if SIZEOF_WCHAR_T == 2
4702 Py_ssize_t wchar_offset = 0;
4703#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004704
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004705 if (!PyUnicode_Check(unicode)) {
4706 PyErr_BadArgument();
4707 return NULL;
4708 }
4709
4710 if (PyUnicode_READY(unicode) == -1)
4711 return NULL;
4712
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004713 if (PyUnicode_UTF8(unicode))
4714 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4715 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004716
4717 kind = PyUnicode_KIND(unicode);
4718 data = PyUnicode_DATA(unicode);
4719 size = PyUnicode_GET_LENGTH(unicode);
4720
Tim Peters602f7402002-04-27 18:03:26 +00004721 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004722
Tim Peters602f7402002-04-27 18:03:26 +00004723 if (size <= MAX_SHORT_UNICHARS) {
4724 /* Write into the stack buffer; nallocated can't overflow.
4725 * At the end, we'll allocate exactly as much heap space as it
4726 * turns out we need.
4727 */
4728 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004729 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004730 p = stackbuf;
4731 }
4732 else {
4733 /* Overallocate on the heap, and give the excess back at the end. */
4734 nallocated = size * 4;
4735 if (nallocated / 4 != size) /* overflow! */
4736 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004737 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004738 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004739 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004740 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004741 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004742
Tim Peters602f7402002-04-27 18:03:26 +00004743 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004744 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004745
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004746 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004747 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004748 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004749
Guido van Rossumd57fd912000-03-10 22:53:23 +00004750 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004751 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004752 *p++ = (char)(0xc0 | (ch >> 6));
4753 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004754 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004755 Py_ssize_t newpos;
4756 PyObject *rep;
4757 Py_ssize_t repsize, k, startpos;
4758 startpos = i-1;
4759#if SIZEOF_WCHAR_T == 2
4760 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004761#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004762 rep = unicode_encode_call_errorhandler(
4763 errors, &errorHandler, "utf-8", "surrogates not allowed",
4764 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4765 &exc, startpos, startpos+1, &newpos);
4766 if (!rep)
4767 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004768
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004769 if (PyBytes_Check(rep))
4770 repsize = PyBytes_GET_SIZE(rep);
4771 else
4772 repsize = PyUnicode_GET_SIZE(rep);
4773
4774 if (repsize > 4) {
4775 Py_ssize_t offset;
4776
4777 if (result == NULL)
4778 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004779 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004780 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004781
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004782 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4783 /* integer overflow */
4784 PyErr_NoMemory();
4785 goto error;
4786 }
4787 nallocated += repsize - 4;
4788 if (result != NULL) {
4789 if (_PyBytes_Resize(&result, nallocated) < 0)
4790 goto error;
4791 } else {
4792 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004793 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004794 goto error;
4795 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4796 }
4797 p = PyBytes_AS_STRING(result) + offset;
4798 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004799
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004800 if (PyBytes_Check(rep)) {
4801 char *prep = PyBytes_AS_STRING(rep);
4802 for(k = repsize; k > 0; k--)
4803 *p++ = *prep++;
4804 } else /* rep is unicode */ {
4805 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4806 Py_UNICODE c;
4807
4808 for(k=0; k<repsize; k++) {
4809 c = prep[k];
4810 if (0x80 <= c) {
4811 raise_encode_exception(&exc, "utf-8",
4812 PyUnicode_AS_UNICODE(unicode),
4813 size, i-1, i,
4814 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004815 goto error;
4816 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004817 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004818 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004819 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004820 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004821 } else if (ch < 0x10000) {
4822 *p++ = (char)(0xe0 | (ch >> 12));
4823 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4824 *p++ = (char)(0x80 | (ch & 0x3f));
4825 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004826 /* Encode UCS4 Unicode ordinals */
4827 *p++ = (char)(0xf0 | (ch >> 18));
4828 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4829 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4830 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004831#if SIZEOF_WCHAR_T == 2
4832 wchar_offset++;
4833#endif
Tim Peters602f7402002-04-27 18:03:26 +00004834 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004835 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004836
Guido van Rossum98297ee2007-11-06 21:34:58 +00004837 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004838 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004839 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004840 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004841 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004842 }
4843 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004844 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004845 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004846 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004847 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004848 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004849
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004850 Py_XDECREF(errorHandler);
4851 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004852 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004853 error:
4854 Py_XDECREF(errorHandler);
4855 Py_XDECREF(exc);
4856 Py_XDECREF(result);
4857 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004858
Tim Peters602f7402002-04-27 18:03:26 +00004859#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004860}
4861
Alexander Belopolsky40018472011-02-26 01:02:56 +00004862PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004863PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4864 Py_ssize_t size,
4865 const char *errors)
4866{
4867 PyObject *v, *unicode;
4868
4869 unicode = PyUnicode_FromUnicode(s, size);
4870 if (unicode == NULL)
4871 return NULL;
4872 v = _PyUnicode_AsUTF8String(unicode, errors);
4873 Py_DECREF(unicode);
4874 return v;
4875}
4876
4877PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004878PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004879{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004880 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004881}
4882
Walter Dörwald41980ca2007-08-16 21:55:45 +00004883/* --- UTF-32 Codec ------------------------------------------------------- */
4884
4885PyObject *
4886PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004887 Py_ssize_t size,
4888 const char *errors,
4889 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004890{
4891 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4892}
4893
4894PyObject *
4895PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004896 Py_ssize_t size,
4897 const char *errors,
4898 int *byteorder,
4899 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004900{
4901 const char *starts = s;
4902 Py_ssize_t startinpos;
4903 Py_ssize_t endinpos;
4904 Py_ssize_t outpos;
4905 PyUnicodeObject *unicode;
4906 Py_UNICODE *p;
4907#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004908 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004909 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004910#else
4911 const int pairs = 0;
4912#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004913 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004914 int bo = 0; /* assume native ordering by default */
4915 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004916 /* Offsets from q for retrieving bytes in the right order. */
4917#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4918 int iorder[] = {0, 1, 2, 3};
4919#else
4920 int iorder[] = {3, 2, 1, 0};
4921#endif
4922 PyObject *errorHandler = NULL;
4923 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004924
Walter Dörwald41980ca2007-08-16 21:55:45 +00004925 q = (unsigned char *)s;
4926 e = q + size;
4927
4928 if (byteorder)
4929 bo = *byteorder;
4930
4931 /* Check for BOM marks (U+FEFF) in the input and adjust current
4932 byte order setting accordingly. In native mode, the leading BOM
4933 mark is skipped, in all other modes, it is copied to the output
4934 stream as-is (giving a ZWNBSP character). */
4935 if (bo == 0) {
4936 if (size >= 4) {
4937 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004938 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004939#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004940 if (bom == 0x0000FEFF) {
4941 q += 4;
4942 bo = -1;
4943 }
4944 else if (bom == 0xFFFE0000) {
4945 q += 4;
4946 bo = 1;
4947 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004948#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004949 if (bom == 0x0000FEFF) {
4950 q += 4;
4951 bo = 1;
4952 }
4953 else if (bom == 0xFFFE0000) {
4954 q += 4;
4955 bo = -1;
4956 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004957#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004958 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004959 }
4960
4961 if (bo == -1) {
4962 /* force LE */
4963 iorder[0] = 0;
4964 iorder[1] = 1;
4965 iorder[2] = 2;
4966 iorder[3] = 3;
4967 }
4968 else if (bo == 1) {
4969 /* force BE */
4970 iorder[0] = 3;
4971 iorder[1] = 2;
4972 iorder[2] = 1;
4973 iorder[3] = 0;
4974 }
4975
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004976 /* On narrow builds we split characters outside the BMP into two
4977 codepoints => count how much extra space we need. */
4978#ifndef Py_UNICODE_WIDE
4979 for (qq = q; qq < e; qq += 4)
4980 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4981 pairs++;
4982#endif
4983
4984 /* This might be one to much, because of a BOM */
4985 unicode = _PyUnicode_New((size+3)/4+pairs);
4986 if (!unicode)
4987 return NULL;
4988 if (size == 0)
4989 return (PyObject *)unicode;
4990
4991 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004992 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004993
Walter Dörwald41980ca2007-08-16 21:55:45 +00004994 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004995 Py_UCS4 ch;
4996 /* remaining bytes at the end? (size should be divisible by 4) */
4997 if (e-q<4) {
4998 if (consumed)
4999 break;
5000 errmsg = "truncated data";
5001 startinpos = ((const char *)q)-starts;
5002 endinpos = ((const char *)e)-starts;
5003 goto utf32Error;
5004 /* The remaining input chars are ignored if the callback
5005 chooses to skip the input */
5006 }
5007 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5008 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005009
Benjamin Peterson29060642009-01-31 22:14:21 +00005010 if (ch >= 0x110000)
5011 {
5012 errmsg = "codepoint not in range(0x110000)";
5013 startinpos = ((const char *)q)-starts;
5014 endinpos = startinpos+4;
5015 goto utf32Error;
5016 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005017#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005018 if (ch >= 0x10000)
5019 {
5020 *p++ = 0xD800 | ((ch-0x10000) >> 10);
5021 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
5022 }
5023 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00005024#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005025 *p++ = ch;
5026 q += 4;
5027 continue;
5028 utf32Error:
5029 outpos = p-PyUnicode_AS_UNICODE(unicode);
5030 if (unicode_decode_call_errorhandler(
5031 errors, &errorHandler,
5032 "utf32", errmsg,
5033 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
5034 &unicode, &outpos, &p))
5035 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005036 }
5037
5038 if (byteorder)
5039 *byteorder = bo;
5040
5041 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005042 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005043
5044 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02005045 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005046 goto onError;
5047
5048 Py_XDECREF(errorHandler);
5049 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005050#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005051 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005052 Py_DECREF(unicode);
5053 return NULL;
5054 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005055#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005056 assert(_PyUnicode_CheckConsistency(unicode, 1));
Walter Dörwald41980ca2007-08-16 21:55:45 +00005057 return (PyObject *)unicode;
5058
Benjamin Peterson29060642009-01-31 22:14:21 +00005059 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005060 Py_DECREF(unicode);
5061 Py_XDECREF(errorHandler);
5062 Py_XDECREF(exc);
5063 return NULL;
5064}
5065
5066PyObject *
5067PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005068 Py_ssize_t size,
5069 const char *errors,
5070 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005071{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005072 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005073 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005074 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005075#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005076 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005077#else
5078 const int pairs = 0;
5079#endif
5080 /* Offsets from p for storing byte pairs in the right order. */
5081#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5082 int iorder[] = {0, 1, 2, 3};
5083#else
5084 int iorder[] = {3, 2, 1, 0};
5085#endif
5086
Benjamin Peterson29060642009-01-31 22:14:21 +00005087#define STORECHAR(CH) \
5088 do { \
5089 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5090 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5091 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5092 p[iorder[0]] = (CH) & 0xff; \
5093 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005094 } while(0)
5095
5096 /* In narrow builds we can output surrogate pairs as one codepoint,
5097 so we need less space. */
5098#ifndef Py_UNICODE_WIDE
5099 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005100 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
5101 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
5102 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005103#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005104 nsize = (size - pairs + (byteorder == 0));
5105 bytesize = nsize * 4;
5106 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005107 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005108 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005109 if (v == NULL)
5110 return NULL;
5111
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005112 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005113 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005114 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005115 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005116 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005117
5118 if (byteorder == -1) {
5119 /* force LE */
5120 iorder[0] = 0;
5121 iorder[1] = 1;
5122 iorder[2] = 2;
5123 iorder[3] = 3;
5124 }
5125 else if (byteorder == 1) {
5126 /* force BE */
5127 iorder[0] = 3;
5128 iorder[1] = 2;
5129 iorder[2] = 1;
5130 iorder[3] = 0;
5131 }
5132
5133 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005134 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005135#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005136 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
5137 Py_UCS4 ch2 = *s;
5138 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
5139 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
5140 s++;
5141 size--;
5142 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00005143 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00005144#endif
5145 STORECHAR(ch);
5146 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005147
5148 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005149 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005150#undef STORECHAR
5151}
5152
Alexander Belopolsky40018472011-02-26 01:02:56 +00005153PyObject *
5154PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005155{
5156 if (!PyUnicode_Check(unicode)) {
5157 PyErr_BadArgument();
5158 return NULL;
5159 }
5160 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005161 PyUnicode_GET_SIZE(unicode),
5162 NULL,
5163 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005164}
5165
Guido van Rossumd57fd912000-03-10 22:53:23 +00005166/* --- UTF-16 Codec ------------------------------------------------------- */
5167
Tim Peters772747b2001-08-09 22:21:55 +00005168PyObject *
5169PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005170 Py_ssize_t size,
5171 const char *errors,
5172 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005173{
Walter Dörwald69652032004-09-07 20:24:22 +00005174 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5175}
5176
Antoine Pitrouab868312009-01-10 15:40:25 +00005177/* Two masks for fast checking of whether a C 'long' may contain
5178 UTF16-encoded surrogate characters. This is an efficient heuristic,
5179 assuming that non-surrogate characters with a code point >= 0x8000 are
5180 rare in most input.
5181 FAST_CHAR_MASK is used when the input is in native byte ordering,
5182 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00005183*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005184#if (SIZEOF_LONG == 8)
5185# define FAST_CHAR_MASK 0x8000800080008000L
5186# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
5187#elif (SIZEOF_LONG == 4)
5188# define FAST_CHAR_MASK 0x80008000L
5189# define SWAPPED_FAST_CHAR_MASK 0x00800080L
5190#else
5191# error C 'long' size should be either 4 or 8!
5192#endif
5193
Walter Dörwald69652032004-09-07 20:24:22 +00005194PyObject *
5195PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005196 Py_ssize_t size,
5197 const char *errors,
5198 int *byteorder,
5199 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005200{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005201 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005202 Py_ssize_t startinpos;
5203 Py_ssize_t endinpos;
5204 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005205 PyUnicodeObject *unicode;
5206 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00005207 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005208 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005209 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005210 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005211 /* Offsets from q for retrieving byte pairs in the right order. */
5212#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5213 int ihi = 1, ilo = 0;
5214#else
5215 int ihi = 0, ilo = 1;
5216#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005217 PyObject *errorHandler = NULL;
5218 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005219
5220 /* Note: size will always be longer than the resulting Unicode
5221 character count */
5222 unicode = _PyUnicode_New(size);
5223 if (!unicode)
5224 return NULL;
5225 if (size == 0)
5226 return (PyObject *)unicode;
5227
5228 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005229 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00005230 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005231 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005232
5233 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005234 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005235
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005236 /* Check for BOM marks (U+FEFF) in the input and adjust current
5237 byte order setting accordingly. In native mode, the leading BOM
5238 mark is skipped, in all other modes, it is copied to the output
5239 stream as-is (giving a ZWNBSP character). */
5240 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005241 if (size >= 2) {
5242 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005243#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005244 if (bom == 0xFEFF) {
5245 q += 2;
5246 bo = -1;
5247 }
5248 else if (bom == 0xFFFE) {
5249 q += 2;
5250 bo = 1;
5251 }
Tim Petersced69f82003-09-16 20:30:58 +00005252#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005253 if (bom == 0xFEFF) {
5254 q += 2;
5255 bo = 1;
5256 }
5257 else if (bom == 0xFFFE) {
5258 q += 2;
5259 bo = -1;
5260 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005261#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005262 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005263 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005264
Tim Peters772747b2001-08-09 22:21:55 +00005265 if (bo == -1) {
5266 /* force LE */
5267 ihi = 1;
5268 ilo = 0;
5269 }
5270 else if (bo == 1) {
5271 /* force BE */
5272 ihi = 0;
5273 ilo = 1;
5274 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005275#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5276 native_ordering = ilo < ihi;
5277#else
5278 native_ordering = ilo > ihi;
5279#endif
Tim Peters772747b2001-08-09 22:21:55 +00005280
Antoine Pitrouab868312009-01-10 15:40:25 +00005281 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005282 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005283 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005284 /* First check for possible aligned read of a C 'long'. Unaligned
5285 reads are more expensive, better to defer to another iteration. */
5286 if (!((size_t) q & LONG_PTR_MASK)) {
5287 /* Fast path for runs of non-surrogate chars. */
5288 register const unsigned char *_q = q;
5289 Py_UNICODE *_p = p;
5290 if (native_ordering) {
5291 /* Native ordering is simple: as long as the input cannot
5292 possibly contain a surrogate char, do an unrolled copy
5293 of several 16-bit code points to the target object.
5294 The non-surrogate check is done on several input bytes
5295 at a time (as many as a C 'long' can contain). */
5296 while (_q < aligned_end) {
5297 unsigned long data = * (unsigned long *) _q;
5298 if (data & FAST_CHAR_MASK)
5299 break;
5300 _p[0] = ((unsigned short *) _q)[0];
5301 _p[1] = ((unsigned short *) _q)[1];
5302#if (SIZEOF_LONG == 8)
5303 _p[2] = ((unsigned short *) _q)[2];
5304 _p[3] = ((unsigned short *) _q)[3];
5305#endif
5306 _q += SIZEOF_LONG;
5307 _p += SIZEOF_LONG / 2;
5308 }
5309 }
5310 else {
5311 /* Byteswapped ordering is similar, but we must decompose
5312 the copy bytewise, and take care of zero'ing out the
5313 upper bytes if the target object is in 32-bit units
5314 (that is, in UCS-4 builds). */
5315 while (_q < aligned_end) {
5316 unsigned long data = * (unsigned long *) _q;
5317 if (data & SWAPPED_FAST_CHAR_MASK)
5318 break;
5319 /* Zero upper bytes in UCS-4 builds */
5320#if (Py_UNICODE_SIZE > 2)
5321 _p[0] = 0;
5322 _p[1] = 0;
5323#if (SIZEOF_LONG == 8)
5324 _p[2] = 0;
5325 _p[3] = 0;
5326#endif
5327#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005328 /* Issue #4916; UCS-4 builds on big endian machines must
5329 fill the two last bytes of each 4-byte unit. */
5330#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
5331# define OFF 2
5332#else
5333# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00005334#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005335 ((unsigned char *) _p)[OFF + 1] = _q[0];
5336 ((unsigned char *) _p)[OFF + 0] = _q[1];
5337 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
5338 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
5339#if (SIZEOF_LONG == 8)
5340 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
5341 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
5342 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
5343 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
5344#endif
5345#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00005346 _q += SIZEOF_LONG;
5347 _p += SIZEOF_LONG / 2;
5348 }
5349 }
5350 p = _p;
5351 q = _q;
5352 if (q >= e)
5353 break;
5354 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005355 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005356
Benjamin Peterson14339b62009-01-31 16:36:08 +00005357 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005358
5359 if (ch < 0xD800 || ch > 0xDFFF) {
5360 *p++ = ch;
5361 continue;
5362 }
5363
5364 /* UTF-16 code pair: */
5365 if (q > e) {
5366 errmsg = "unexpected end of data";
5367 startinpos = (((const char *)q) - 2) - starts;
5368 endinpos = ((const char *)e) + 1 - starts;
5369 goto utf16Error;
5370 }
5371 if (0xD800 <= ch && ch <= 0xDBFF) {
5372 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
5373 q += 2;
5374 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00005375#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005376 *p++ = ch;
5377 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005378#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005379 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005380#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005381 continue;
5382 }
5383 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005384 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005385 startinpos = (((const char *)q)-4)-starts;
5386 endinpos = startinpos+2;
5387 goto utf16Error;
5388 }
5389
Benjamin Peterson14339b62009-01-31 16:36:08 +00005390 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005391 errmsg = "illegal encoding";
5392 startinpos = (((const char *)q)-2)-starts;
5393 endinpos = startinpos+2;
5394 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005395
Benjamin Peterson29060642009-01-31 22:14:21 +00005396 utf16Error:
5397 outpos = p - PyUnicode_AS_UNICODE(unicode);
5398 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005399 errors,
5400 &errorHandler,
5401 "utf16", errmsg,
5402 &starts,
5403 (const char **)&e,
5404 &startinpos,
5405 &endinpos,
5406 &exc,
5407 (const char **)&q,
5408 &unicode,
5409 &outpos,
5410 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00005411 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005412 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005413 /* remaining byte at the end? (size should be even) */
5414 if (e == q) {
5415 if (!consumed) {
5416 errmsg = "truncated data";
5417 startinpos = ((const char *)q) - starts;
5418 endinpos = ((const char *)e) + 1 - starts;
5419 outpos = p - PyUnicode_AS_UNICODE(unicode);
5420 if (unicode_decode_call_errorhandler(
5421 errors,
5422 &errorHandler,
5423 "utf16", errmsg,
5424 &starts,
5425 (const char **)&e,
5426 &startinpos,
5427 &endinpos,
5428 &exc,
5429 (const char **)&q,
5430 &unicode,
5431 &outpos,
5432 &p))
5433 goto onError;
5434 /* The remaining input chars are ignored if the callback
5435 chooses to skip the input */
5436 }
5437 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005438
5439 if (byteorder)
5440 *byteorder = bo;
5441
Walter Dörwald69652032004-09-07 20:24:22 +00005442 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005443 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005444
Guido van Rossumd57fd912000-03-10 22:53:23 +00005445 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02005446 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005447 goto onError;
5448
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005449 Py_XDECREF(errorHandler);
5450 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005451#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005452 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005453 Py_DECREF(unicode);
5454 return NULL;
5455 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005456#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005457 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005458 return (PyObject *)unicode;
5459
Benjamin Peterson29060642009-01-31 22:14:21 +00005460 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005461 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005462 Py_XDECREF(errorHandler);
5463 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005464 return NULL;
5465}
5466
Antoine Pitrouab868312009-01-10 15:40:25 +00005467#undef FAST_CHAR_MASK
5468#undef SWAPPED_FAST_CHAR_MASK
5469
Tim Peters772747b2001-08-09 22:21:55 +00005470PyObject *
5471PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005472 Py_ssize_t size,
5473 const char *errors,
5474 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005475{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005476 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005477 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005478 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005479#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005480 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005481#else
5482 const int pairs = 0;
5483#endif
Tim Peters772747b2001-08-09 22:21:55 +00005484 /* Offsets from p for storing byte pairs in the right order. */
5485#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5486 int ihi = 1, ilo = 0;
5487#else
5488 int ihi = 0, ilo = 1;
5489#endif
5490
Benjamin Peterson29060642009-01-31 22:14:21 +00005491#define STORECHAR(CH) \
5492 do { \
5493 p[ihi] = ((CH) >> 8) & 0xff; \
5494 p[ilo] = (CH) & 0xff; \
5495 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005496 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005497
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005498#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005499 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005500 if (s[i] >= 0x10000)
5501 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005502#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005503 /* 2 * (size + pairs + (byteorder == 0)) */
5504 if (size > PY_SSIZE_T_MAX ||
5505 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005506 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005507 nsize = size + pairs + (byteorder == 0);
5508 bytesize = nsize * 2;
5509 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005510 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005511 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005512 if (v == NULL)
5513 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005514
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005515 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005516 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005517 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005518 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005519 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005520
5521 if (byteorder == -1) {
5522 /* force LE */
5523 ihi = 1;
5524 ilo = 0;
5525 }
5526 else if (byteorder == 1) {
5527 /* force BE */
5528 ihi = 0;
5529 ilo = 1;
5530 }
5531
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005532 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005533 Py_UNICODE ch = *s++;
5534 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005535#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005536 if (ch >= 0x10000) {
5537 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5538 ch = 0xD800 | ((ch-0x10000) >> 10);
5539 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005540#endif
Tim Peters772747b2001-08-09 22:21:55 +00005541 STORECHAR(ch);
5542 if (ch2)
5543 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005544 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005545
5546 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005547 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005548#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005549}
5550
Alexander Belopolsky40018472011-02-26 01:02:56 +00005551PyObject *
5552PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005553{
5554 if (!PyUnicode_Check(unicode)) {
5555 PyErr_BadArgument();
5556 return NULL;
5557 }
5558 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005559 PyUnicode_GET_SIZE(unicode),
5560 NULL,
5561 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005562}
5563
5564/* --- Unicode Escape Codec ----------------------------------------------- */
5565
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005566/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5567 if all the escapes in the string make it still a valid ASCII string.
5568 Returns -1 if any escapes were found which cause the string to
5569 pop out of ASCII range. Otherwise returns the length of the
5570 required buffer to hold the string.
5571 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005572static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005573length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5574{
5575 const unsigned char *p = (const unsigned char *)s;
5576 const unsigned char *end = p + size;
5577 Py_ssize_t length = 0;
5578
5579 if (size < 0)
5580 return -1;
5581
5582 for (; p < end; ++p) {
5583 if (*p > 127) {
5584 /* Non-ASCII */
5585 return -1;
5586 }
5587 else if (*p != '\\') {
5588 /* Normal character */
5589 ++length;
5590 }
5591 else {
5592 /* Backslash-escape, check next char */
5593 ++p;
5594 /* Escape sequence reaches till end of string or
5595 non-ASCII follow-up. */
5596 if (p >= end || *p > 127)
5597 return -1;
5598 switch (*p) {
5599 case '\n':
5600 /* backslash + \n result in zero characters */
5601 break;
5602 case '\\': case '\'': case '\"':
5603 case 'b': case 'f': case 't':
5604 case 'n': case 'r': case 'v': case 'a':
5605 ++length;
5606 break;
5607 case '0': case '1': case '2': case '3':
5608 case '4': case '5': case '6': case '7':
5609 case 'x': case 'u': case 'U': case 'N':
5610 /* these do not guarantee ASCII characters */
5611 return -1;
5612 default:
5613 /* count the backslash + the other character */
5614 length += 2;
5615 }
5616 }
5617 }
5618 return length;
5619}
5620
5621/* Similar to PyUnicode_WRITE but either write into wstr field
5622 or treat string as ASCII. */
5623#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \
5624 do { \
5625 if ((kind) != PyUnicode_WCHAR_KIND) \
5626 ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
5627 else \
5628 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
5629 } while (0)
5630
5631#define WRITE_WSTR(buf, index, value) \
5632 assert(kind == PyUnicode_WCHAR_KIND), \
5633 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value)
5634
5635
Fredrik Lundh06d12682001-01-24 07:59:11 +00005636static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005637
Alexander Belopolsky40018472011-02-26 01:02:56 +00005638PyObject *
5639PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005640 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005641 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005642{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005643 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005644 Py_ssize_t startinpos;
5645 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005646 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005647 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005648 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005649 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005650 char* message;
5651 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005652 PyObject *errorHandler = NULL;
5653 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005654 Py_ssize_t ascii_length;
5655 Py_ssize_t i;
5656 int kind;
5657 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005658
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005659 ascii_length = length_of_escaped_ascii_string(s, size);
5660
5661 /* After length_of_escaped_ascii_string() there are two alternatives,
5662 either the string is pure ASCII with named escapes like \n, etc.
5663 and we determined it's exact size (common case)
5664 or it contains \x, \u, ... escape sequences. then we create a
5665 legacy wchar string and resize it at the end of this function. */
5666 if (ascii_length >= 0) {
5667 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
5668 if (!v)
5669 goto onError;
5670 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5671 kind = PyUnicode_1BYTE_KIND;
5672 data = PyUnicode_DATA(v);
5673 }
5674 else {
5675 /* Escaped strings will always be longer than the resulting
5676 Unicode string, so we start with size here and then reduce the
5677 length after conversion to the true value.
5678 (but if the error callback returns a long replacement string
5679 we'll have to allocate more space) */
5680 v = _PyUnicode_New(size);
5681 if (!v)
5682 goto onError;
5683 kind = PyUnicode_WCHAR_KIND;
5684 data = PyUnicode_AS_UNICODE(v);
5685 }
5686
Guido van Rossumd57fd912000-03-10 22:53:23 +00005687 if (size == 0)
5688 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005689 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005690 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005691
Guido van Rossumd57fd912000-03-10 22:53:23 +00005692 while (s < end) {
5693 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005694 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005695 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005696
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005697 if (kind == PyUnicode_WCHAR_KIND) {
5698 assert(i < _PyUnicode_WSTR_LENGTH(v));
5699 }
5700 else {
5701 /* The only case in which i == ascii_length is a backslash
5702 followed by a newline. */
5703 assert(i <= ascii_length);
5704 }
5705
Guido van Rossumd57fd912000-03-10 22:53:23 +00005706 /* Non-escape characters are interpreted as Unicode ordinals */
5707 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005708 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005709 continue;
5710 }
5711
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005712 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005713 /* \ - Escapes */
5714 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005715 c = *s++;
5716 if (s > end)
5717 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005718
5719 if (kind == PyUnicode_WCHAR_KIND) {
5720 assert(i < _PyUnicode_WSTR_LENGTH(v));
5721 }
5722 else {
5723 /* The only case in which i == ascii_length is a backslash
5724 followed by a newline. */
5725 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5726 }
5727
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005728 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005729
Benjamin Peterson29060642009-01-31 22:14:21 +00005730 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005731 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005732 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5733 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5734 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5735 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5736 /* FF */
5737 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5738 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5739 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5740 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5741 /* VT */
5742 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5743 /* BEL, not classic C */
5744 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005745
Benjamin Peterson29060642009-01-31 22:14:21 +00005746 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005747 case '0': case '1': case '2': case '3':
5748 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005749 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005750 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005751 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005752 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005753 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005754 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005755 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005756 break;
5757
Benjamin Peterson29060642009-01-31 22:14:21 +00005758 /* hex escapes */
5759 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005760 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005761 digits = 2;
5762 message = "truncated \\xXX escape";
5763 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005764
Benjamin Peterson29060642009-01-31 22:14:21 +00005765 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005766 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005767 digits = 4;
5768 message = "truncated \\uXXXX escape";
5769 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005770
Benjamin Peterson29060642009-01-31 22:14:21 +00005771 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005772 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005773 digits = 8;
5774 message = "truncated \\UXXXXXXXX escape";
5775 hexescape:
5776 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005777 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005778 if (s+digits>end) {
5779 endinpos = size;
5780 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005781 errors, &errorHandler,
5782 "unicodeescape", "end of string in escape sequence",
5783 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005784 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005785 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005786 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005787 goto nextByte;
5788 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005789 for (j = 0; j < digits; ++j) {
5790 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005791 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005792 endinpos = (s+j+1)-starts;
5793 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005794 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005795 errors, &errorHandler,
5796 "unicodeescape", message,
5797 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005798 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005799 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005800 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005801 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005802 }
5803 chr = (chr<<4) & ~0xF;
5804 if (c >= '0' && c <= '9')
5805 chr += c - '0';
5806 else if (c >= 'a' && c <= 'f')
5807 chr += 10 + c - 'a';
5808 else
5809 chr += 10 + c - 'A';
5810 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005811 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005812 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005813 /* _decoding_error will have already written into the
5814 target buffer. */
5815 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005816 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005817 /* when we get here, chr is a 32-bit unicode character */
5818 if (chr <= 0xffff)
5819 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005820 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005821 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005822 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005823 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005824#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005825 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005826#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005827 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005828 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5829 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005830#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005831 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005832 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005833 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005834 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005835 errors, &errorHandler,
5836 "unicodeescape", "illegal Unicode character",
5837 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005838 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005839 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005840 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005841 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005842 break;
5843
Benjamin Peterson29060642009-01-31 22:14:21 +00005844 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005845 case 'N':
5846 message = "malformed \\N character escape";
5847 if (ucnhash_CAPI == NULL) {
5848 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005849 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5850 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005851 if (ucnhash_CAPI == NULL)
5852 goto ucnhashError;
5853 }
5854 if (*s == '{') {
5855 const char *start = s+1;
5856 /* look for the closing brace */
5857 while (*s != '}' && s < end)
5858 s++;
5859 if (s > start && s < end && *s == '}') {
5860 /* found a name. look it up in the unicode database */
5861 message = "unknown Unicode character name";
5862 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005863 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005864 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005865 goto store;
5866 }
5867 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005868 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005869 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005870 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005871 errors, &errorHandler,
5872 "unicodeescape", message,
5873 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005874 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005875 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005876 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005877 break;
5878
5879 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005880 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005881 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005882 message = "\\ at end of string";
5883 s--;
5884 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005885 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005886 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005887 errors, &errorHandler,
5888 "unicodeescape", message,
5889 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005890 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005891 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005892 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005893 }
5894 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005895 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5896 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005897 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005898 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005899 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005900 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005901 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005902 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005903 /* Ensure the length prediction worked in case of ASCII strings */
5904 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5905
Victor Stinnerfe226c02011-10-03 03:52:20 +02005906 if (kind == PyUnicode_WCHAR_KIND)
5907 {
5908 if (PyUnicode_Resize((PyObject**)&v, i) < 0)
5909 goto onError;
Victor Stinnerfe226c02011-10-03 03:52:20 +02005910 }
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005911 Py_XDECREF(errorHandler);
5912 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005913#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005914 if (_PyUnicode_READY_REPLACE(&v)) {
5915 Py_DECREF(v);
5916 return NULL;
5917 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005918#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005919 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005920 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005921
Benjamin Peterson29060642009-01-31 22:14:21 +00005922 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005923 PyErr_SetString(
5924 PyExc_UnicodeError,
5925 "\\N escapes not supported (can't load unicodedata module)"
5926 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005927 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005928 Py_XDECREF(errorHandler);
5929 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005930 return NULL;
5931
Benjamin Peterson29060642009-01-31 22:14:21 +00005932 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005933 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005934 Py_XDECREF(errorHandler);
5935 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005936 return NULL;
5937}
5938
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005939#undef WRITE_ASCII_OR_WSTR
5940#undef WRITE_WSTR
5941
Guido van Rossumd57fd912000-03-10 22:53:23 +00005942/* Return a Unicode-Escape string version of the Unicode object.
5943
5944 If quotes is true, the string is enclosed in u"" or u'' quotes as
5945 appropriate.
5946
5947*/
5948
Alexander Belopolsky40018472011-02-26 01:02:56 +00005949PyObject *
5950PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005951 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005952{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005953 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005954 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005955
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005956#ifdef Py_UNICODE_WIDE
5957 const Py_ssize_t expandsize = 10;
5958#else
5959 const Py_ssize_t expandsize = 6;
5960#endif
5961
Thomas Wouters89f507f2006-12-13 04:49:30 +00005962 /* XXX(nnorwitz): rather than over-allocating, it would be
5963 better to choose a different scheme. Perhaps scan the
5964 first N-chars of the string and allocate based on that size.
5965 */
5966 /* Initial allocation is based on the longest-possible unichr
5967 escape.
5968
5969 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5970 unichr, so in this case it's the longest unichr escape. In
5971 narrow (UTF-16) builds this is five chars per source unichr
5972 since there are two unichrs in the surrogate pair, so in narrow
5973 (UTF-16) builds it's not the longest unichr escape.
5974
5975 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5976 so in the narrow (UTF-16) build case it's the longest unichr
5977 escape.
5978 */
5979
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005980 if (size == 0)
5981 return PyBytes_FromStringAndSize(NULL, 0);
5982
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005983 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005984 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005985
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005986 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005987 2
5988 + expandsize*size
5989 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005990 if (repr == NULL)
5991 return NULL;
5992
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005993 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005994
Guido van Rossumd57fd912000-03-10 22:53:23 +00005995 while (size-- > 0) {
5996 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005997
Walter Dörwald79e913e2007-05-12 11:08:06 +00005998 /* Escape backslashes */
5999 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006000 *p++ = '\\';
6001 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00006002 continue;
Tim Petersced69f82003-09-16 20:30:58 +00006003 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006004
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00006005#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006006 /* Map 21-bit characters to '\U00xxxxxx' */
6007 else if (ch >= 0x10000) {
6008 *p++ = '\\';
6009 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006010 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
6011 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
6012 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6013 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6014 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6015 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6016 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6017 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00006018 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006019 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00006020#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006021 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
6022 else if (ch >= 0xD800 && ch < 0xDC00) {
6023 Py_UNICODE ch2;
6024 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00006025
Benjamin Peterson29060642009-01-31 22:14:21 +00006026 ch2 = *s++;
6027 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00006028 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006029 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
6030 *p++ = '\\';
6031 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006032 *p++ = Py_hexdigits[(ucs >> 28) & 0x0000000F];
6033 *p++ = Py_hexdigits[(ucs >> 24) & 0x0000000F];
6034 *p++ = Py_hexdigits[(ucs >> 20) & 0x0000000F];
6035 *p++ = Py_hexdigits[(ucs >> 16) & 0x0000000F];
6036 *p++ = Py_hexdigits[(ucs >> 12) & 0x0000000F];
6037 *p++ = Py_hexdigits[(ucs >> 8) & 0x0000000F];
6038 *p++ = Py_hexdigits[(ucs >> 4) & 0x0000000F];
6039 *p++ = Py_hexdigits[ucs & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00006040 continue;
6041 }
6042 /* Fall through: isolated surrogates are copied as-is */
6043 s--;
6044 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006045 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00006046#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006047
Guido van Rossumd57fd912000-03-10 22:53:23 +00006048 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00006049 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006050 *p++ = '\\';
6051 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006052 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6053 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6054 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6055 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006056 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006057
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006058 /* Map special whitespace to '\t', \n', '\r' */
6059 else if (ch == '\t') {
6060 *p++ = '\\';
6061 *p++ = 't';
6062 }
6063 else if (ch == '\n') {
6064 *p++ = '\\';
6065 *p++ = 'n';
6066 }
6067 else if (ch == '\r') {
6068 *p++ = '\\';
6069 *p++ = 'r';
6070 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006071
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006072 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00006073 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006074 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006075 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006076 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6077 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00006078 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006079
Guido van Rossumd57fd912000-03-10 22:53:23 +00006080 /* Copy everything else as-is */
6081 else
6082 *p++ = (char) ch;
6083 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006084
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006085 assert(p - PyBytes_AS_STRING(repr) > 0);
6086 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6087 return NULL;
6088 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006089}
6090
Alexander Belopolsky40018472011-02-26 01:02:56 +00006091PyObject *
6092PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006093{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006094 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006095 if (!PyUnicode_Check(unicode)) {
6096 PyErr_BadArgument();
6097 return NULL;
6098 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00006099 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
6100 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006101 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006102}
6103
6104/* --- Raw Unicode Escape Codec ------------------------------------------- */
6105
Alexander Belopolsky40018472011-02-26 01:02:56 +00006106PyObject *
6107PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006108 Py_ssize_t size,
6109 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006110{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006111 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006112 Py_ssize_t startinpos;
6113 Py_ssize_t endinpos;
6114 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006115 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006116 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006117 const char *end;
6118 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006119 PyObject *errorHandler = NULL;
6120 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006121
Guido van Rossumd57fd912000-03-10 22:53:23 +00006122 /* Escaped strings will always be longer than the resulting
6123 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006124 length after conversion to the true value. (But decoding error
6125 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006126 v = _PyUnicode_New(size);
6127 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006128 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006129 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006130 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006131 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006132 end = s + size;
6133 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006134 unsigned char c;
6135 Py_UCS4 x;
6136 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006137 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006138
Benjamin Peterson29060642009-01-31 22:14:21 +00006139 /* Non-escape characters are interpreted as Unicode ordinals */
6140 if (*s != '\\') {
6141 *p++ = (unsigned char)*s++;
6142 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006143 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006144 startinpos = s-starts;
6145
6146 /* \u-escapes are only interpreted iff the number of leading
6147 backslashes if odd */
6148 bs = s;
6149 for (;s < end;) {
6150 if (*s != '\\')
6151 break;
6152 *p++ = (unsigned char)*s++;
6153 }
6154 if (((s - bs) & 1) == 0 ||
6155 s >= end ||
6156 (*s != 'u' && *s != 'U')) {
6157 continue;
6158 }
6159 p--;
6160 count = *s=='u' ? 4 : 8;
6161 s++;
6162
6163 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
6164 outpos = p-PyUnicode_AS_UNICODE(v);
6165 for (x = 0, i = 0; i < count; ++i, ++s) {
6166 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006167 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006168 endinpos = s-starts;
6169 if (unicode_decode_call_errorhandler(
6170 errors, &errorHandler,
6171 "rawunicodeescape", "truncated \\uXXXX",
6172 &starts, &end, &startinpos, &endinpos, &exc, &s,
6173 &v, &outpos, &p))
6174 goto onError;
6175 goto nextByte;
6176 }
6177 x = (x<<4) & ~0xF;
6178 if (c >= '0' && c <= '9')
6179 x += c - '0';
6180 else if (c >= 'a' && c <= 'f')
6181 x += 10 + c - 'a';
6182 else
6183 x += 10 + c - 'A';
6184 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00006185 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00006186 /* UCS-2 character */
6187 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006188 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006189 /* UCS-4 character. Either store directly, or as
6190 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00006191#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006192 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006193#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006194 x -= 0x10000L;
6195 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
6196 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00006197#endif
6198 } else {
6199 endinpos = s-starts;
6200 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006201 if (unicode_decode_call_errorhandler(
6202 errors, &errorHandler,
6203 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006204 &starts, &end, &startinpos, &endinpos, &exc, &s,
6205 &v, &outpos, &p))
6206 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006207 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006208 nextByte:
6209 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006210 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02006211 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006212 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006213 Py_XDECREF(errorHandler);
6214 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006215#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006216 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006217 Py_DECREF(v);
6218 return NULL;
6219 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006220#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006221 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006222 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006223
Benjamin Peterson29060642009-01-31 22:14:21 +00006224 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006225 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006226 Py_XDECREF(errorHandler);
6227 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006228 return NULL;
6229}
6230
Alexander Belopolsky40018472011-02-26 01:02:56 +00006231PyObject *
6232PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006233 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006234{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006235 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006236 char *p;
6237 char *q;
6238
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006239#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006240 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006241#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006242 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006243#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00006244
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006245 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006246 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006247
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006248 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006249 if (repr == NULL)
6250 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00006251 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006252 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006253
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006254 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006255 while (size-- > 0) {
6256 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006257#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006258 /* Map 32-bit characters to '\Uxxxxxxxx' */
6259 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006260 *p++ = '\\';
6261 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006262 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6263 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6264 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6265 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6266 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6267 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6268 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6269 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006270 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006271 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00006272#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006273 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
6274 if (ch >= 0xD800 && ch < 0xDC00) {
6275 Py_UNICODE ch2;
6276 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006277
Benjamin Peterson29060642009-01-31 22:14:21 +00006278 ch2 = *s++;
6279 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00006280 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006281 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
6282 *p++ = '\\';
6283 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006284 *p++ = Py_hexdigits[(ucs >> 28) & 0xf];
6285 *p++ = Py_hexdigits[(ucs >> 24) & 0xf];
6286 *p++ = Py_hexdigits[(ucs >> 20) & 0xf];
6287 *p++ = Py_hexdigits[(ucs >> 16) & 0xf];
6288 *p++ = Py_hexdigits[(ucs >> 12) & 0xf];
6289 *p++ = Py_hexdigits[(ucs >> 8) & 0xf];
6290 *p++ = Py_hexdigits[(ucs >> 4) & 0xf];
6291 *p++ = Py_hexdigits[ucs & 0xf];
Benjamin Peterson29060642009-01-31 22:14:21 +00006292 continue;
6293 }
6294 /* Fall through: isolated surrogates are copied as-is */
6295 s--;
6296 size++;
6297 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006298#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006299 /* Map 16-bit characters to '\uxxxx' */
6300 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006301 *p++ = '\\';
6302 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006303 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6304 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6305 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6306 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006307 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006308 /* Copy everything else as-is */
6309 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006310 *p++ = (char) ch;
6311 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006312 size = p - q;
6313
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006314 assert(size > 0);
6315 if (_PyBytes_Resize(&repr, size) < 0)
6316 return NULL;
6317 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006318}
6319
Alexander Belopolsky40018472011-02-26 01:02:56 +00006320PyObject *
6321PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006322{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006323 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006324 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00006325 PyErr_BadArgument();
6326 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006327 }
Walter Dörwald711005d2007-05-12 12:03:26 +00006328 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
6329 PyUnicode_GET_SIZE(unicode));
6330
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006331 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006332}
6333
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006334/* --- Unicode Internal Codec ------------------------------------------- */
6335
Alexander Belopolsky40018472011-02-26 01:02:56 +00006336PyObject *
6337_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006338 Py_ssize_t size,
6339 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006340{
6341 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006342 Py_ssize_t startinpos;
6343 Py_ssize_t endinpos;
6344 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006345 PyUnicodeObject *v;
6346 Py_UNICODE *p;
6347 const char *end;
6348 const char *reason;
6349 PyObject *errorHandler = NULL;
6350 PyObject *exc = NULL;
6351
Neal Norwitzd43069c2006-01-08 01:12:10 +00006352#ifdef Py_UNICODE_WIDE
6353 Py_UNICODE unimax = PyUnicode_GetMax();
6354#endif
6355
Thomas Wouters89f507f2006-12-13 04:49:30 +00006356 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006357 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
6358 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006359 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006360 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
6361 as string was created with the old API. */
6362 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006363 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006364 p = PyUnicode_AS_UNICODE(v);
6365 end = s + size;
6366
6367 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006368 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006369 /* We have to sanity check the raw data, otherwise doom looms for
6370 some malformed UCS-4 data. */
6371 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006372#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006373 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006374#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006375 end-s < Py_UNICODE_SIZE
6376 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006377 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006378 startinpos = s - starts;
6379 if (end-s < Py_UNICODE_SIZE) {
6380 endinpos = end-starts;
6381 reason = "truncated input";
6382 }
6383 else {
6384 endinpos = s - starts + Py_UNICODE_SIZE;
6385 reason = "illegal code point (> 0x10FFFF)";
6386 }
6387 outpos = p - PyUnicode_AS_UNICODE(v);
6388 if (unicode_decode_call_errorhandler(
6389 errors, &errorHandler,
6390 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006391 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00006392 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006393 goto onError;
6394 }
6395 }
6396 else {
6397 p++;
6398 s += Py_UNICODE_SIZE;
6399 }
6400 }
6401
Victor Stinnerfe226c02011-10-03 03:52:20 +02006402 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006403 goto onError;
6404 Py_XDECREF(errorHandler);
6405 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006406#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006407 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006408 Py_DECREF(v);
6409 return NULL;
6410 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006411#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006412 assert(_PyUnicode_CheckConsistency(v, 1));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006413 return (PyObject *)v;
6414
Benjamin Peterson29060642009-01-31 22:14:21 +00006415 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006416 Py_XDECREF(v);
6417 Py_XDECREF(errorHandler);
6418 Py_XDECREF(exc);
6419 return NULL;
6420}
6421
Guido van Rossumd57fd912000-03-10 22:53:23 +00006422/* --- Latin-1 Codec ------------------------------------------------------ */
6423
Alexander Belopolsky40018472011-02-26 01:02:56 +00006424PyObject *
6425PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006426 Py_ssize_t size,
6427 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006428{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006429 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006430 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006431}
6432
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006433/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006434static void
6435make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006436 const char *encoding,
6437 const Py_UNICODE *unicode, Py_ssize_t size,
6438 Py_ssize_t startpos, Py_ssize_t endpos,
6439 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006440{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006441 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006442 *exceptionObject = PyUnicodeEncodeError_Create(
6443 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006444 }
6445 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006446 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6447 goto onError;
6448 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6449 goto onError;
6450 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6451 goto onError;
6452 return;
6453 onError:
6454 Py_DECREF(*exceptionObject);
6455 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006456 }
6457}
6458
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006459/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006460static void
6461raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006462 const char *encoding,
6463 const Py_UNICODE *unicode, Py_ssize_t size,
6464 Py_ssize_t startpos, Py_ssize_t endpos,
6465 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006466{
6467 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006468 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006469 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006470 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006471}
6472
6473/* error handling callback helper:
6474 build arguments, call the callback and check the arguments,
6475 put the result into newpos and return the replacement string, which
6476 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006477static PyObject *
6478unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006479 PyObject **errorHandler,
6480 const char *encoding, const char *reason,
6481 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
6482 Py_ssize_t startpos, Py_ssize_t endpos,
6483 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006484{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006485 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006486
6487 PyObject *restuple;
6488 PyObject *resunicode;
6489
6490 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006491 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006492 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006493 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006494 }
6495
6496 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006497 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006498 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006499 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006500
6501 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006502 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006503 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006504 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006505 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006506 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006507 Py_DECREF(restuple);
6508 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006509 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006510 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006511 &resunicode, newpos)) {
6512 Py_DECREF(restuple);
6513 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006514 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006515 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6516 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6517 Py_DECREF(restuple);
6518 return NULL;
6519 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006520 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006521 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006522 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006523 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6524 Py_DECREF(restuple);
6525 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006526 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006527 Py_INCREF(resunicode);
6528 Py_DECREF(restuple);
6529 return resunicode;
6530}
6531
Alexander Belopolsky40018472011-02-26 01:02:56 +00006532static PyObject *
6533unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006534 Py_ssize_t size,
6535 const char *errors,
6536 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006537{
6538 /* output object */
6539 PyObject *res;
6540 /* pointers to the beginning and end+1 of input */
6541 const Py_UNICODE *startp = p;
6542 const Py_UNICODE *endp = p + size;
6543 /* pointer to the beginning of the unencodable characters */
6544 /* const Py_UNICODE *badp = NULL; */
6545 /* pointer into the output */
6546 char *str;
6547 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006548 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006549 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6550 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006551 PyObject *errorHandler = NULL;
6552 PyObject *exc = NULL;
6553 /* the following variable is used for caching string comparisons
6554 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6555 int known_errorHandler = -1;
6556
6557 /* allocate enough for a simple encoding without
6558 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006559 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006560 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006561 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006562 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006563 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006564 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006565 ressize = size;
6566
6567 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006568 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006569
Benjamin Peterson29060642009-01-31 22:14:21 +00006570 /* can we encode this? */
6571 if (c<limit) {
6572 /* no overflow check, because we know that the space is enough */
6573 *str++ = (char)c;
6574 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006575 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006576 else {
6577 Py_ssize_t unicodepos = p-startp;
6578 Py_ssize_t requiredsize;
6579 PyObject *repunicode;
6580 Py_ssize_t repsize;
6581 Py_ssize_t newpos;
6582 Py_ssize_t respos;
6583 Py_UNICODE *uni2;
6584 /* startpos for collecting unencodable chars */
6585 const Py_UNICODE *collstart = p;
6586 const Py_UNICODE *collend = p;
6587 /* find all unecodable characters */
6588 while ((collend < endp) && ((*collend)>=limit))
6589 ++collend;
6590 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6591 if (known_errorHandler==-1) {
6592 if ((errors==NULL) || (!strcmp(errors, "strict")))
6593 known_errorHandler = 1;
6594 else if (!strcmp(errors, "replace"))
6595 known_errorHandler = 2;
6596 else if (!strcmp(errors, "ignore"))
6597 known_errorHandler = 3;
6598 else if (!strcmp(errors, "xmlcharrefreplace"))
6599 known_errorHandler = 4;
6600 else
6601 known_errorHandler = 0;
6602 }
6603 switch (known_errorHandler) {
6604 case 1: /* strict */
6605 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
6606 goto onError;
6607 case 2: /* replace */
6608 while (collstart++<collend)
6609 *str++ = '?'; /* fall through */
6610 case 3: /* ignore */
6611 p = collend;
6612 break;
6613 case 4: /* xmlcharrefreplace */
6614 respos = str - PyBytes_AS_STRING(res);
6615 /* determine replacement size (temporarily (mis)uses p) */
6616 for (p = collstart, repsize = 0; p < collend; ++p) {
6617 if (*p<10)
6618 repsize += 2+1+1;
6619 else if (*p<100)
6620 repsize += 2+2+1;
6621 else if (*p<1000)
6622 repsize += 2+3+1;
6623 else if (*p<10000)
6624 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006625#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006626 else
6627 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006628#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006629 else if (*p<100000)
6630 repsize += 2+5+1;
6631 else if (*p<1000000)
6632 repsize += 2+6+1;
6633 else
6634 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00006635#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006636 }
6637 requiredsize = respos+repsize+(endp-collend);
6638 if (requiredsize > ressize) {
6639 if (requiredsize<2*ressize)
6640 requiredsize = 2*ressize;
6641 if (_PyBytes_Resize(&res, requiredsize))
6642 goto onError;
6643 str = PyBytes_AS_STRING(res) + respos;
6644 ressize = requiredsize;
6645 }
6646 /* generate replacement (temporarily (mis)uses p) */
6647 for (p = collstart; p < collend; ++p) {
6648 str += sprintf(str, "&#%d;", (int)*p);
6649 }
6650 p = collend;
6651 break;
6652 default:
6653 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6654 encoding, reason, startp, size, &exc,
6655 collstart-startp, collend-startp, &newpos);
6656 if (repunicode == NULL)
6657 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006658 if (PyBytes_Check(repunicode)) {
6659 /* Directly copy bytes result to output. */
6660 repsize = PyBytes_Size(repunicode);
6661 if (repsize > 1) {
6662 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006663 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006664 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6665 Py_DECREF(repunicode);
6666 goto onError;
6667 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006668 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006669 ressize += repsize-1;
6670 }
6671 memcpy(str, PyBytes_AsString(repunicode), repsize);
6672 str += repsize;
6673 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006674 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006675 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006676 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006677 /* need more space? (at least enough for what we
6678 have+the replacement+the rest of the string, so
6679 we won't have to check space for encodable characters) */
6680 respos = str - PyBytes_AS_STRING(res);
6681 repsize = PyUnicode_GET_SIZE(repunicode);
6682 requiredsize = respos+repsize+(endp-collend);
6683 if (requiredsize > ressize) {
6684 if (requiredsize<2*ressize)
6685 requiredsize = 2*ressize;
6686 if (_PyBytes_Resize(&res, requiredsize)) {
6687 Py_DECREF(repunicode);
6688 goto onError;
6689 }
6690 str = PyBytes_AS_STRING(res) + respos;
6691 ressize = requiredsize;
6692 }
6693 /* check if there is anything unencodable in the replacement
6694 and copy it to the output */
6695 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
6696 c = *uni2;
6697 if (c >= limit) {
6698 raise_encode_exception(&exc, encoding, startp, size,
6699 unicodepos, unicodepos+1, reason);
6700 Py_DECREF(repunicode);
6701 goto onError;
6702 }
6703 *str = (char)c;
6704 }
6705 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006706 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006707 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006708 }
6709 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006710 /* Resize if we allocated to much */
6711 size = str - PyBytes_AS_STRING(res);
6712 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006713 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006714 if (_PyBytes_Resize(&res, size) < 0)
6715 goto onError;
6716 }
6717
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006718 Py_XDECREF(errorHandler);
6719 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006720 return res;
6721
6722 onError:
6723 Py_XDECREF(res);
6724 Py_XDECREF(errorHandler);
6725 Py_XDECREF(exc);
6726 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006727}
6728
Alexander Belopolsky40018472011-02-26 01:02:56 +00006729PyObject *
6730PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006731 Py_ssize_t size,
6732 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006733{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006734 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006735}
6736
Alexander Belopolsky40018472011-02-26 01:02:56 +00006737PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006738_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006739{
6740 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006741 PyErr_BadArgument();
6742 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006743 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006744 if (PyUnicode_READY(unicode) == -1)
6745 return NULL;
6746 /* Fast path: if it is a one-byte string, construct
6747 bytes object directly. */
6748 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6749 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6750 PyUnicode_GET_LENGTH(unicode));
6751 /* Non-Latin-1 characters present. Defer to above function to
6752 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006753 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006754 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006755 errors);
6756}
6757
6758PyObject*
6759PyUnicode_AsLatin1String(PyObject *unicode)
6760{
6761 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006762}
6763
6764/* --- 7-bit ASCII Codec -------------------------------------------------- */
6765
Alexander Belopolsky40018472011-02-26 01:02:56 +00006766PyObject *
6767PyUnicode_DecodeASCII(const char *s,
6768 Py_ssize_t size,
6769 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006770{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006771 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006772 PyUnicodeObject *v;
Victor Stinner702c7342011-10-05 13:50:52 +02006773 Py_UNICODE *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006774 Py_ssize_t startinpos;
6775 Py_ssize_t endinpos;
6776 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006777 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006778 int has_error;
6779 const unsigned char *p = (const unsigned char *)s;
6780 const unsigned char *end = p + size;
6781 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006782 PyObject *errorHandler = NULL;
6783 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006784
Guido van Rossumd57fd912000-03-10 22:53:23 +00006785 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006786 if (size == 1 && (unsigned char)s[0] < 128)
6787 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006788
Victor Stinner702c7342011-10-05 13:50:52 +02006789 has_error = 0;
6790 while (p < end && !has_error) {
6791 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6792 an explanation. */
6793 if (!((size_t) p & LONG_PTR_MASK)) {
6794 /* Help register allocation */
6795 register const unsigned char *_p = p;
6796 while (_p < aligned_end) {
6797 unsigned long value = *(unsigned long *) _p;
6798 if (value & ASCII_CHAR_MASK) {
6799 has_error = 1;
6800 break;
6801 }
6802 _p += SIZEOF_LONG;
6803 }
6804 if (_p == end)
6805 break;
6806 if (has_error)
6807 break;
6808 p = _p;
6809 }
6810 if (*p & 0x80) {
6811 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006812 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006813 }
6814 else {
6815 ++p;
6816 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006817 }
Victor Stinner702c7342011-10-05 13:50:52 +02006818 if (!has_error)
6819 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006820
Guido van Rossumd57fd912000-03-10 22:53:23 +00006821 v = _PyUnicode_New(size);
6822 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006823 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006824 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006825 return (PyObject *)v;
Victor Stinner702c7342011-10-05 13:50:52 +02006826 u = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006827 e = s + size;
6828 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006829 register unsigned char c = (unsigned char)*s;
6830 if (c < 128) {
Victor Stinner702c7342011-10-05 13:50:52 +02006831 *u++ = c;
Benjamin Peterson29060642009-01-31 22:14:21 +00006832 ++s;
6833 }
6834 else {
6835 startinpos = s-starts;
6836 endinpos = startinpos + 1;
Victor Stinner702c7342011-10-05 13:50:52 +02006837 outpos = u - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006838 if (unicode_decode_call_errorhandler(
6839 errors, &errorHandler,
6840 "ascii", "ordinal not in range(128)",
6841 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinner702c7342011-10-05 13:50:52 +02006842 &v, &outpos, &u))
Benjamin Peterson29060642009-01-31 22:14:21 +00006843 goto onError;
6844 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006845 }
Victor Stinner702c7342011-10-05 13:50:52 +02006846 if (u - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
6847 if (PyUnicode_Resize((PyObject**)&v, u - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006848 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006849 Py_XDECREF(errorHandler);
6850 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006851#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006852 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006853 Py_DECREF(v);
6854 return NULL;
6855 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006856#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006857 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006858 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006859
Benjamin Peterson29060642009-01-31 22:14:21 +00006860 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006861 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006862 Py_XDECREF(errorHandler);
6863 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006864 return NULL;
6865}
6866
Alexander Belopolsky40018472011-02-26 01:02:56 +00006867PyObject *
6868PyUnicode_EncodeASCII(const Py_UNICODE *p,
6869 Py_ssize_t size,
6870 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006871{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006872 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006873}
6874
Alexander Belopolsky40018472011-02-26 01:02:56 +00006875PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006876_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006877{
6878 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006879 PyErr_BadArgument();
6880 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006881 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006882 if (PyUnicode_READY(unicode) == -1)
6883 return NULL;
6884 /* Fast path: if it is an ASCII-only string, construct bytes object
6885 directly. Else defer to above function to raise the exception. */
6886 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6887 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6888 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006889 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006890 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006891 errors);
6892}
6893
6894PyObject *
6895PyUnicode_AsASCIIString(PyObject *unicode)
6896{
6897 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006898}
6899
Victor Stinner99b95382011-07-04 14:23:54 +02006900#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006901
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006902/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006903
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006904#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006905#define NEED_RETRY
6906#endif
6907
Victor Stinner3a50e702011-10-18 21:21:00 +02006908#ifndef WC_ERR_INVALID_CHARS
6909# define WC_ERR_INVALID_CHARS 0x0080
6910#endif
6911
6912static char*
6913code_page_name(UINT code_page, PyObject **obj)
6914{
6915 *obj = NULL;
6916 if (code_page == CP_ACP)
6917 return "mbcs";
6918 if (code_page == CP_UTF7)
6919 return "CP_UTF7";
6920 if (code_page == CP_UTF8)
6921 return "CP_UTF8";
6922
6923 *obj = PyBytes_FromFormat("cp%u", code_page);
6924 if (*obj == NULL)
6925 return NULL;
6926 return PyBytes_AS_STRING(*obj);
6927}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006928
Alexander Belopolsky40018472011-02-26 01:02:56 +00006929static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006930is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006931{
6932 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006933 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006934
Victor Stinner3a50e702011-10-18 21:21:00 +02006935 if (!IsDBCSLeadByteEx(code_page, *curr))
6936 return 0;
6937
6938 prev = CharPrevExA(code_page, s, curr, 0);
6939 if (prev == curr)
6940 return 1;
6941 /* FIXME: This code is limited to "true" double-byte encodings,
6942 as it assumes an incomplete character consists of a single
6943 byte. */
6944 if (curr - prev == 2)
6945 return 1;
6946 if (!IsDBCSLeadByteEx(code_page, *prev))
6947 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006948 return 0;
6949}
6950
Victor Stinner3a50e702011-10-18 21:21:00 +02006951static DWORD
6952decode_code_page_flags(UINT code_page)
6953{
6954 if (code_page == CP_UTF7) {
6955 /* The CP_UTF7 decoder only supports flags=0 */
6956 return 0;
6957 }
6958 else
6959 return MB_ERR_INVALID_CHARS;
6960}
6961
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006962/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006963 * Decode a byte string from a Windows code page into unicode object in strict
6964 * mode.
6965 *
6966 * Returns consumed size if succeed, returns -2 on decode error, or raise a
6967 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006968 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006969static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006970decode_code_page_strict(UINT code_page,
6971 PyUnicodeObject **v,
6972 const char *in,
6973 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006974{
Victor Stinner3a50e702011-10-18 21:21:00 +02006975 const DWORD flags = decode_code_page_flags(code_page);
6976 Py_UNICODE *out;
6977 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006978
6979 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006980 assert(insize > 0);
6981 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6982 if (outsize <= 0)
6983 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006984
6985 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006986 /* Create unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006987 *v = _PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006988 if (*v == NULL)
6989 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006990 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006991 }
6992 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006993 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006994 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6995 if (PyUnicode_Resize((PyObject**)v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006996 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006997 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006998 }
6999
7000 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007001 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7002 if (outsize <= 0)
7003 goto error;
7004 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007005
Victor Stinner3a50e702011-10-18 21:21:00 +02007006error:
7007 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7008 return -2;
7009 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007010 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007011}
7012
Victor Stinner3a50e702011-10-18 21:21:00 +02007013/*
7014 * Decode a byte string from a code page into unicode object with an error
7015 * handler.
7016 *
7017 * Returns consumed size if succeed, or raise a WindowsError or
7018 * UnicodeDecodeError exception and returns -1 on error.
7019 */
7020static int
7021decode_code_page_errors(UINT code_page,
7022 PyUnicodeObject **v,
7023 const char *in,
7024 int size,
7025 const char *errors)
7026{
7027 const char *startin = in;
7028 const char *endin = in + size;
7029 const DWORD flags = decode_code_page_flags(code_page);
7030 /* Ideally, we should get reason from FormatMessage. This is the Windows
7031 2000 English version of the message. */
7032 const char *reason = "No mapping for the Unicode character exists "
7033 "in the target code page.";
7034 /* each step cannot decode more than 1 character, but a character can be
7035 represented as a surrogate pair */
7036 wchar_t buffer[2], *startout, *out;
7037 int insize, outsize;
7038 PyObject *errorHandler = NULL;
7039 PyObject *exc = NULL;
7040 PyObject *encoding_obj = NULL;
7041 char *encoding;
7042 DWORD err;
7043 int ret = -1;
7044
7045 assert(size > 0);
7046
7047 encoding = code_page_name(code_page, &encoding_obj);
7048 if (encoding == NULL)
7049 return -1;
7050
7051 if (errors == NULL || strcmp(errors, "strict") == 0) {
7052 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7053 UnicodeDecodeError. */
7054 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7055 if (exc != NULL) {
7056 PyCodec_StrictErrors(exc);
7057 Py_CLEAR(exc);
7058 }
7059 goto error;
7060 }
7061
7062 if (*v == NULL) {
7063 /* Create unicode object */
7064 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7065 PyErr_NoMemory();
7066 goto error;
7067 }
7068 *v = _PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
7069 if (*v == NULL)
7070 goto error;
7071 startout = PyUnicode_AS_UNICODE(*v);
7072 }
7073 else {
7074 /* Extend unicode object */
7075 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7076 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7077 PyErr_NoMemory();
7078 goto error;
7079 }
7080 if (PyUnicode_Resize((PyObject**)v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
7081 goto error;
7082 startout = PyUnicode_AS_UNICODE(*v) + n;
7083 }
7084
7085 /* Decode the byte string character per character */
7086 out = startout;
7087 while (in < endin)
7088 {
7089 /* Decode a character */
7090 insize = 1;
7091 do
7092 {
7093 outsize = MultiByteToWideChar(code_page, flags,
7094 in, insize,
7095 buffer, Py_ARRAY_LENGTH(buffer));
7096 if (outsize > 0)
7097 break;
7098 err = GetLastError();
7099 if (err != ERROR_NO_UNICODE_TRANSLATION
7100 && err != ERROR_INSUFFICIENT_BUFFER)
7101 {
7102 PyErr_SetFromWindowsErr(0);
7103 goto error;
7104 }
7105 insize++;
7106 }
7107 /* 4=maximum length of a UTF-8 sequence */
7108 while (insize <= 4 && (in + insize) <= endin);
7109
7110 if (outsize <= 0) {
7111 Py_ssize_t startinpos, endinpos, outpos;
7112
7113 startinpos = in - startin;
7114 endinpos = startinpos + 1;
7115 outpos = out - PyUnicode_AS_UNICODE(*v);
7116 if (unicode_decode_call_errorhandler(
7117 errors, &errorHandler,
7118 encoding, reason,
7119 &startin, &endin, &startinpos, &endinpos, &exc, &in,
7120 v, &outpos, &out))
7121 {
7122 goto error;
7123 }
7124 }
7125 else {
7126 in += insize;
7127 memcpy(out, buffer, outsize * sizeof(wchar_t));
7128 out += outsize;
7129 }
7130 }
7131
7132 /* write a NUL character at the end */
7133 *out = 0;
7134
7135 /* Extend unicode object */
7136 outsize = out - startout;
7137 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
7138 if (PyUnicode_Resize((PyObject**)v, outsize) < 0)
7139 goto error;
7140 ret = 0;
7141
7142error:
7143 Py_XDECREF(encoding_obj);
7144 Py_XDECREF(errorHandler);
7145 Py_XDECREF(exc);
7146 return ret;
7147}
7148
7149/*
7150 * Decode a byte string from a Windows code page into unicode object. If
7151 * 'final' is set, converts trailing lead-byte too.
7152 *
7153 * Returns consumed size if succeed, or raise a WindowsError or
7154 * UnicodeDecodeError exception and returns -1 on error.
7155 */
7156static int
7157decode_code_page(UINT code_page,
7158 PyUnicodeObject **v,
7159 const char *s, int size,
7160 int final, const char *errors)
7161{
7162 int done;
7163
7164 /* Skip trailing lead-byte unless 'final' is set */
7165 if (size == 0) {
7166 if (*v == NULL) {
7167 Py_INCREF(unicode_empty);
7168 *v = (PyUnicodeObject*)unicode_empty;
7169 if (*v == NULL)
7170 return -1;
7171 }
7172 return 0;
7173 }
7174
7175 if (!final && is_dbcs_lead_byte(code_page, s, size - 1))
7176 --size;
7177
7178 done = decode_code_page_strict(code_page, v, s, size);
7179 if (done == -2)
7180 done = decode_code_page_errors(code_page, v, s, size, errors);
7181 return done;
7182}
7183
7184static PyObject *
7185decode_code_page_stateful(int code_page,
7186 const char *s,
7187 Py_ssize_t size,
7188 const char *errors,
7189 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007190{
7191 PyUnicodeObject *v = NULL;
7192 int done;
7193
Victor Stinner3a50e702011-10-18 21:21:00 +02007194 if (code_page < 0) {
7195 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7196 return NULL;
7197 }
7198
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007199 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007200 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007201
7202#ifdef NEED_RETRY
7203 retry:
7204 if (size > INT_MAX)
Victor Stinner3a50e702011-10-18 21:21:00 +02007205 done = decode_code_page(code_page, &v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007206 else
7207#endif
Victor Stinner3a50e702011-10-18 21:21:00 +02007208 done = decode_code_page(code_page, &v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007209
7210 if (done < 0) {
7211 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00007212 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007213 }
7214
7215 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007216 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007217
7218#ifdef NEED_RETRY
7219 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007220 s += done;
7221 size -= done;
7222 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007223 }
7224#endif
Victor Stinner3a50e702011-10-18 21:21:00 +02007225
Victor Stinner17efeed2011-10-04 20:05:46 +02007226#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007227 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007228 Py_DECREF(v);
7229 return NULL;
7230 }
Victor Stinner17efeed2011-10-04 20:05:46 +02007231#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02007232 assert(_PyUnicode_CheckConsistency(v, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007233 return (PyObject *)v;
7234}
7235
Alexander Belopolsky40018472011-02-26 01:02:56 +00007236PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007237PyUnicode_DecodeCodePageStateful(int code_page,
7238 const char *s,
7239 Py_ssize_t size,
7240 const char *errors,
7241 Py_ssize_t *consumed)
7242{
7243 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7244}
7245
7246PyObject *
7247PyUnicode_DecodeMBCSStateful(const char *s,
7248 Py_ssize_t size,
7249 const char *errors,
7250 Py_ssize_t *consumed)
7251{
7252 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7253}
7254
7255PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007256PyUnicode_DecodeMBCS(const char *s,
7257 Py_ssize_t size,
7258 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007259{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007260 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7261}
7262
Victor Stinner3a50e702011-10-18 21:21:00 +02007263static DWORD
7264encode_code_page_flags(UINT code_page, const char *errors)
7265{
7266 if (code_page == CP_UTF8) {
7267 if (winver.dwMajorVersion >= 6)
7268 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7269 and later */
7270 return WC_ERR_INVALID_CHARS;
7271 else
7272 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7273 return 0;
7274 }
7275 else if (code_page == CP_UTF7) {
7276 /* CP_UTF7 only supports flags=0 */
7277 return 0;
7278 }
7279 else {
7280 if (errors != NULL && strcmp(errors, "replace") == 0)
7281 return 0;
7282 else
7283 return WC_NO_BEST_FIT_CHARS;
7284 }
7285}
7286
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007287/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007288 * Encode a Unicode string to a Windows code page into a byte string in strict
7289 * mode.
7290 *
7291 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7292 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007293 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007294static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007295encode_code_page_strict(UINT code_page, PyObject **outbytes,
7296 const Py_UNICODE *p, const int size,
7297 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007298{
Victor Stinner554f3f02010-06-16 23:33:54 +00007299 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007300 BOOL *pusedDefaultChar = &usedDefaultChar;
7301 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007302 PyObject *exc = NULL;
Victor Stinner3a50e702011-10-18 21:21:00 +02007303 const DWORD flags = encode_code_page_flags(code_page, NULL);
7304 char *out;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007305
Victor Stinner3a50e702011-10-18 21:21:00 +02007306 assert(size > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007307
Victor Stinner3a50e702011-10-18 21:21:00 +02007308 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007309 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007310 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007311 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007312
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007313 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007314 outsize = WideCharToMultiByte(code_page, flags,
7315 p, size,
7316 NULL, 0,
7317 NULL, pusedDefaultChar);
7318 if (outsize <= 0)
7319 goto error;
7320 /* If we used a default char, then we failed! */
7321 if (pusedDefaultChar && *pusedDefaultChar)
7322 return -2;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007323
Victor Stinner3a50e702011-10-18 21:21:00 +02007324 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007325 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007326 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7327 if (*outbytes == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007328 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007329 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007330 }
7331 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007332 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007333 const Py_ssize_t n = PyBytes_Size(*outbytes);
7334 if (outsize > PY_SSIZE_T_MAX - n) {
7335 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00007336 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007337 }
7338 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7339 return -1;
7340 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007341 }
7342
7343 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007344 outsize = WideCharToMultiByte(code_page, flags,
7345 p, size,
7346 out, outsize,
7347 NULL, pusedDefaultChar);
7348 if (outsize <= 0)
7349 goto error;
7350 if (pusedDefaultChar && *pusedDefaultChar)
7351 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007352 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007353
Victor Stinner3a50e702011-10-18 21:21:00 +02007354error:
7355 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7356 return -2;
7357 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007358 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007359}
7360
/*
 * Encode a Unicode string to a Windows code page into a byte string using an
 * error handler.
 *
 * The input is converted one character at a time (a surrogate pair counts as
 * one character).  Characters the code page cannot represent are handed to
 * the error handler named by `errors`; its replacement (bytes, or an
 * ASCII-only str) is copied to the output.  With errors == NULL or "strict"
 * a UnicodeEncodeError is raised immediately.
 *
 * The result is appended to *outbytes (created if NULL).
 *
 * Returns 0 on success, or sets an exception and returns -1 on error.
 */
static int
encode_code_page_errors(UINT code_page, PyObject **outbytes,
                        const Py_UNICODE *in, const int insize,
                        const char* errors)
{
    const DWORD flags = encode_code_page_flags(code_page, errors);
    const Py_UNICODE *startin = in;
    const Py_UNICODE *endin = in + insize;
    /* Ideally, we should get reason from FormatMessage. This is the Windows
       2000 English version of the message. */
    const char *reason = "invalid character";
    /* 4=maximum length of a UTF-8 sequence */
    char buffer[4];
    BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
    Py_ssize_t outsize;
    char *out;
    int charsize;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *encoding_obj = NULL;
    char *encoding;
    Py_ssize_t startpos, newpos, newoutsize;
    PyObject *rep;
    int ret = -1;

    assert(insize > 0);

    encoding = code_page_name(code_page, &encoding_obj);
    if (encoding == NULL)
        return -1;

    if (errors == NULL || strcmp(errors, "strict") == 0) {
        /* The last error was ERROR_NO_UNICODE_TRANSLATION,
           then we raise a UnicodeEncodeError. */
        make_encode_exception(&exc, encoding, in, insize, 0, 0, reason);
        if (exc != NULL) {
            PyCodec_StrictErrors(exc);
            Py_DECREF(exc);
        }
        Py_XDECREF(encoding_obj);
        return -1;
    }

    /* No "used default char" report is requested for CP_UTF8 / CP_UTF7 */
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
        pusedDefaultChar = &usedDefaultChar;
    else
        pusedDefaultChar = NULL;

    /* Budget sizeof(buffer) output bytes per input code unit, checking that
       the multiplication cannot overflow. */
    if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
        PyErr_NoMemory();
        goto error;
    }
    outsize = insize * Py_ARRAY_LENGTH(buffer);

    if (*outbytes == NULL) {
        /* Create string object */
        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
        if (*outbytes == NULL)
            goto error;
        out = PyBytes_AS_STRING(*outbytes);
    }
    else {
        /* Extend string object */
        Py_ssize_t n = PyBytes_Size(*outbytes);
        if (n > PY_SSIZE_T_MAX - outsize) {
            PyErr_NoMemory();
            goto error;
        }
        if (_PyBytes_Resize(outbytes, n + outsize) < 0)
            goto error;
        out = PyBytes_AS_STRING(*outbytes) + n;
    }

    /* Encode the string character per character */
    while (in < endin)
    {
        /* A high surrogate followed by a low surrogate is converted as a
           single two-unit character. */
        if ((in + 2) <= endin
            && 0xD800 <= in[0] && in[0] <= 0xDBFF
            && 0xDC00 <= in[1] && in[1] <= 0xDFFF)
            charsize = 2;
        else
            charsize = 1;

        outsize = WideCharToMultiByte(code_page, flags,
                                      in, charsize,
                                      buffer, Py_ARRAY_LENGTH(buffer),
                                      NULL, pusedDefaultChar);
        if (outsize > 0) {
            if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
            {
                /* Clean conversion: copy the bytes and move on */
                in += charsize;
                memcpy(out, buffer, outsize);
                out += outsize;
                continue;
            }
            /* else: a default char was substituted, treat it as an
               unencodable character and fall through to the handler */
        }
        else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
            PyErr_SetFromWindowsErr(0);
            goto error;
        }

        /* Unencodable character: ask the error handler for a replacement.
           The reported range is always one unit long at this point. */
        charsize = Py_MAX(charsize - 1, 1);
        startpos = in - startin;
        rep = unicode_encode_call_errorhandler(
            errors, &errorHandler, encoding, reason,
            startin, insize, &exc,
            startpos, startpos + charsize, &newpos);
        if (rep == NULL)
            goto error;
        /* Resume wherever the handler asked us to.
           NOTE(review): the output budget is not re-checked here; this
           presumably relies on newpos never moving backwards -- confirm
           against unicode_encode_call_errorhandler. */
        in = startin + newpos;

        if (PyBytes_Check(rep)) {
            /* Bytes replacement: copied verbatim */
            outsize = PyBytes_GET_SIZE(rep);
            if (outsize != 1) {
                /* Adjust the allocation by (outsize - 1); remember the
                   write offset because the resize may move the buffer. */
                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
                    Py_DECREF(rep);
                    goto error;
                }
                out = PyBytes_AS_STRING(*outbytes) + offset;
            }
            memcpy(out, PyBytes_AS_STRING(rep), outsize);
            out += outsize;
        }
        else {
            /* str replacement: only code points <= 127 are accepted and
               each one is written as a single byte */
            Py_ssize_t i;
            enum PyUnicode_Kind kind;
            void *data;

            if (PyUnicode_READY(rep) < 0) {
                Py_DECREF(rep);
                goto error;
            }

            outsize = PyUnicode_GET_LENGTH(rep);
            if (outsize != 1) {
                /* Same allocation adjustment as in the bytes branch */
                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
                    Py_DECREF(rep);
                    goto error;
                }
                out = PyBytes_AS_STRING(*outbytes) + offset;
            }
            kind = PyUnicode_KIND(rep);
            data = PyUnicode_DATA(rep);
            for (i=0; i < outsize; i++) {
                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
                if (ch > 127) {
                    raise_encode_exception(&exc,
                        encoding,
                        startin, insize,
                        startpos, startpos + charsize,
                        "unable to encode error handler result to ASCII");
                    Py_DECREF(rep);
                    goto error;
                }
                *out = (unsigned char)ch;
                out++;
            }
        }
        Py_DECREF(rep);
    }
    /* write a NUL byte */
    *out = 0;
    /* Trim the over-allocated object down to the bytes actually written */
    outsize = out - PyBytes_AS_STRING(*outbytes);
    assert(outsize <= PyBytes_GET_SIZE(*outbytes));
    if (_PyBytes_Resize(outbytes, outsize) < 0)
        goto error;
    ret = 0;

    /* Shared exit: reached on success (ret == 0) and on failure (ret == -1) */
error:
    Py_XDECREF(encoding_obj);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return ret;
}
7546
7547/*
7548 * Encode a Unicode string to a Windows code page into a byte string.
7549 *
7550 * Returns consumed characters if succeed, or raise a WindowsError and returns
7551 * -1 on other error.
7552 */
7553static int
7554encode_code_page_chunk(UINT code_page, PyObject **outbytes,
7555 const Py_UNICODE *p, int size,
7556 const char* errors)
7557{
7558 int done;
7559
7560 if (size == 0) {
7561 if (*outbytes == NULL) {
7562 *outbytes = PyBytes_FromStringAndSize(NULL, 0);
7563 if (*outbytes == NULL)
7564 return -1;
7565 }
7566 return 0;
7567 }
7568
7569 done = encode_code_page_strict(code_page, outbytes, p, size, errors);
7570 if (done == -2)
7571 done = encode_code_page_errors(code_page, outbytes, p, size, errors);
7572 return done;
7573}
7574
7575static PyObject *
7576encode_code_page(int code_page,
7577 const Py_UNICODE *p, Py_ssize_t size,
7578 const char *errors)
7579{
7580 PyObject *outbytes = NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007581 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00007582
Victor Stinner3a50e702011-10-18 21:21:00 +02007583 if (code_page < 0) {
7584 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7585 return NULL;
7586 }
7587
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007588#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00007589 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007590 if (size > INT_MAX)
Victor Stinner3a50e702011-10-18 21:21:00 +02007591 ret = encode_code_page_chunk(code_page, &outbytes, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007592 else
7593#endif
Victor Stinner3a50e702011-10-18 21:21:00 +02007594 ret = encode_code_page_chunk(code_page, &outbytes, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007595
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007596 if (ret < 0) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007597 Py_XDECREF(outbytes);
Benjamin Peterson29060642009-01-31 22:14:21 +00007598 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007599 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007600
7601#ifdef NEED_RETRY
7602 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007603 p += INT_MAX;
7604 size -= INT_MAX;
7605 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007606 }
7607#endif
7608
Victor Stinner3a50e702011-10-18 21:21:00 +02007609 return outbytes;
7610}
7611
/* Encode the `size` Py_UNICODE units at `p` with the CP_ACP code page.
   Thin wrapper around encode_code_page(); returns whatever it returns
   (a bytes object, or NULL on error). */
PyObject *
PyUnicode_EncodeMBCS(const Py_UNICODE *p,
                     Py_ssize_t size,
                     const char *errors)
{
    return encode_code_page(CP_ACP, p, size, errors);
}
7619
7620PyObject *
7621PyUnicode_EncodeCodePage(int code_page,
7622 PyObject *unicode,
7623 const char *errors)
7624{
7625 const Py_UNICODE *p;
7626 Py_ssize_t size;
7627 p = PyUnicode_AsUnicodeAndSize(unicode, &size);
7628 if (p == NULL)
7629 return NULL;
7630 return encode_code_page(code_page, p, size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007631}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007632
Alexander Belopolsky40018472011-02-26 01:02:56 +00007633PyObject *
7634PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007635{
7636 if (!PyUnicode_Check(unicode)) {
7637 PyErr_BadArgument();
7638 return NULL;
7639 }
7640 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007641 PyUnicode_GET_SIZE(unicode),
7642 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007643}
7644
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007645#undef NEED_RETRY
7646
Victor Stinner99b95382011-07-04 14:23:54 +02007647#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007648
Guido van Rossumd57fd912000-03-10 22:53:23 +00007649/* --- Character Mapping Codec -------------------------------------------- */
7650
Alexander Belopolsky40018472011-02-26 01:02:56 +00007651PyObject *
7652PyUnicode_DecodeCharmap(const char *s,
7653 Py_ssize_t size,
7654 PyObject *mapping,
7655 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007656{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007657 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007658 Py_ssize_t startinpos;
7659 Py_ssize_t endinpos;
7660 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007661 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007662 PyUnicodeObject *v;
7663 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007664 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007665 PyObject *errorHandler = NULL;
7666 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007667 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007668 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00007669
Guido van Rossumd57fd912000-03-10 22:53:23 +00007670 /* Default to Latin-1 */
7671 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007672 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007673
7674 v = _PyUnicode_New(size);
7675 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007676 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007677 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007678 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007679 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007680 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007681 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007682 mapstring = PyUnicode_AS_UNICODE(mapping);
7683 maplen = PyUnicode_GET_SIZE(mapping);
7684 while (s < e) {
7685 unsigned char ch = *s;
7686 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007687
Benjamin Peterson29060642009-01-31 22:14:21 +00007688 if (ch < maplen)
7689 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00007690
Benjamin Peterson29060642009-01-31 22:14:21 +00007691 if (x == 0xfffe) {
7692 /* undefined mapping */
7693 outpos = p-PyUnicode_AS_UNICODE(v);
7694 startinpos = s-starts;
7695 endinpos = startinpos+1;
7696 if (unicode_decode_call_errorhandler(
7697 errors, &errorHandler,
7698 "charmap", "character maps to <undefined>",
7699 &starts, &e, &startinpos, &endinpos, &exc, &s,
7700 &v, &outpos, &p)) {
7701 goto onError;
7702 }
7703 continue;
7704 }
7705 *p++ = x;
7706 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007707 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007708 }
7709 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007710 while (s < e) {
7711 unsigned char ch = *s;
7712 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007713
Benjamin Peterson29060642009-01-31 22:14:21 +00007714 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7715 w = PyLong_FromLong((long)ch);
7716 if (w == NULL)
7717 goto onError;
7718 x = PyObject_GetItem(mapping, w);
7719 Py_DECREF(w);
7720 if (x == NULL) {
7721 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7722 /* No mapping found means: mapping is undefined. */
7723 PyErr_Clear();
7724 x = Py_None;
7725 Py_INCREF(x);
7726 } else
7727 goto onError;
7728 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007729
Benjamin Peterson29060642009-01-31 22:14:21 +00007730 /* Apply mapping */
7731 if (PyLong_Check(x)) {
7732 long value = PyLong_AS_LONG(x);
7733 if (value < 0 || value > 65535) {
7734 PyErr_SetString(PyExc_TypeError,
7735 "character mapping must be in range(65536)");
7736 Py_DECREF(x);
7737 goto onError;
7738 }
7739 *p++ = (Py_UNICODE)value;
7740 }
7741 else if (x == Py_None) {
7742 /* undefined mapping */
7743 outpos = p-PyUnicode_AS_UNICODE(v);
7744 startinpos = s-starts;
7745 endinpos = startinpos+1;
7746 if (unicode_decode_call_errorhandler(
7747 errors, &errorHandler,
7748 "charmap", "character maps to <undefined>",
7749 &starts, &e, &startinpos, &endinpos, &exc, &s,
7750 &v, &outpos, &p)) {
7751 Py_DECREF(x);
7752 goto onError;
7753 }
7754 Py_DECREF(x);
7755 continue;
7756 }
7757 else if (PyUnicode_Check(x)) {
7758 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007759
Benjamin Peterson29060642009-01-31 22:14:21 +00007760 if (targetsize == 1)
7761 /* 1-1 mapping */
7762 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007763
Benjamin Peterson29060642009-01-31 22:14:21 +00007764 else if (targetsize > 1) {
7765 /* 1-n mapping */
7766 if (targetsize > extrachars) {
7767 /* resize first */
7768 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
7769 Py_ssize_t needed = (targetsize - extrachars) + \
7770 (targetsize << 2);
7771 extrachars += needed;
7772 /* XXX overflow detection missing */
Victor Stinnerfe226c02011-10-03 03:52:20 +02007773 if (PyUnicode_Resize((PyObject**)&v,
Benjamin Peterson29060642009-01-31 22:14:21 +00007774 PyUnicode_GET_SIZE(v) + needed) < 0) {
7775 Py_DECREF(x);
7776 goto onError;
7777 }
7778 p = PyUnicode_AS_UNICODE(v) + oldpos;
7779 }
7780 Py_UNICODE_COPY(p,
7781 PyUnicode_AS_UNICODE(x),
7782 targetsize);
7783 p += targetsize;
7784 extrachars -= targetsize;
7785 }
7786 /* 1-0 mapping: skip the character */
7787 }
7788 else {
7789 /* wrong return value */
7790 PyErr_SetString(PyExc_TypeError,
7791 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007792 Py_DECREF(x);
7793 goto onError;
7794 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007795 Py_DECREF(x);
7796 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007797 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007798 }
7799 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02007800 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007801 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007802 Py_XDECREF(errorHandler);
7803 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02007804#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007805 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007806 Py_DECREF(v);
7807 return NULL;
7808 }
Victor Stinner17efeed2011-10-04 20:05:46 +02007809#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02007810 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007811 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00007812
Benjamin Peterson29060642009-01-31 22:14:21 +00007813 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007814 Py_XDECREF(errorHandler);
7815 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007816 Py_XDECREF(v);
7817 return NULL;
7818}
7819
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007820/* Charmap encoding: the lookup table */
7821
/* Three-level lookup table mapping a BMP code point to a byte value. */
struct encoding_map {
    PyObject_HEAD
    /* Level 1, indexed by the code point's top 5 bits (c >> 11).  0xFF marks
       an unused slot; any other value selects a level-2 block. */
    unsigned char level1[32];
    /* Number of 16-entry level-2 blocks and of 128-entry level-3 blocks
       stored in level23. */
    int count2, count3;
    /* Variable-length tail: 16*count2 bytes of level-2 entries followed by
       128*count3 bytes of level-3 entries.  Declared with one element, so
       allocation sizes are computed as sizeof(struct encoding_map) - 1 plus
       the payload. */
    unsigned char level23[1];
};
7828
7829static PyObject*
7830encoding_map_size(PyObject *obj, PyObject* args)
7831{
7832 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007833 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007834 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007835}
7836
7837static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007838 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007839 PyDoc_STR("Return the size (in bytes) of this object") },
7840 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007841};
7842
/* tp_dealloc for EncodingMap: the object is a single allocation obtained
   from PyObject_MALLOC and holds no references, so freeing the block is all
   that is needed. */
static void
encoding_map_dealloc(PyObject* o)
{
    PyObject_FREE(o);
}
7848
/* Type object for the trie produced by PyUnicode_BuildEncodingMap().
   Only deallocation and the "size" method are provided; every other slot
   is left at 0. */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,   /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_reserved*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};
7892
7893PyObject*
7894PyUnicode_BuildEncodingMap(PyObject* string)
7895{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007896 PyObject *result;
7897 struct encoding_map *mresult;
7898 int i;
7899 int need_dict = 0;
7900 unsigned char level1[32];
7901 unsigned char level2[512];
7902 unsigned char *mlevel1, *mlevel2, *mlevel3;
7903 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007904 int kind;
7905 void *data;
7906 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007907
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007908 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007909 PyErr_BadArgument();
7910 return NULL;
7911 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007912 kind = PyUnicode_KIND(string);
7913 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007914 memset(level1, 0xFF, sizeof level1);
7915 memset(level2, 0xFF, sizeof level2);
7916
7917 /* If there isn't a one-to-one mapping of NULL to \0,
7918 or if there are non-BMP characters, we need to use
7919 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007920 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007921 need_dict = 1;
7922 for (i = 1; i < 256; i++) {
7923 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007924 ch = PyUnicode_READ(kind, data, i);
7925 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007926 need_dict = 1;
7927 break;
7928 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007929 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007930 /* unmapped character */
7931 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007932 l1 = ch >> 11;
7933 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007934 if (level1[l1] == 0xFF)
7935 level1[l1] = count2++;
7936 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007937 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007938 }
7939
7940 if (count2 >= 0xFF || count3 >= 0xFF)
7941 need_dict = 1;
7942
7943 if (need_dict) {
7944 PyObject *result = PyDict_New();
7945 PyObject *key, *value;
7946 if (!result)
7947 return NULL;
7948 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007949 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007950 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007951 if (!key || !value)
7952 goto failed1;
7953 if (PyDict_SetItem(result, key, value) == -1)
7954 goto failed1;
7955 Py_DECREF(key);
7956 Py_DECREF(value);
7957 }
7958 return result;
7959 failed1:
7960 Py_XDECREF(key);
7961 Py_XDECREF(value);
7962 Py_DECREF(result);
7963 return NULL;
7964 }
7965
7966 /* Create a three-level trie */
7967 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7968 16*count2 + 128*count3 - 1);
7969 if (!result)
7970 return PyErr_NoMemory();
7971 PyObject_Init(result, &EncodingMapType);
7972 mresult = (struct encoding_map*)result;
7973 mresult->count2 = count2;
7974 mresult->count3 = count3;
7975 mlevel1 = mresult->level1;
7976 mlevel2 = mresult->level23;
7977 mlevel3 = mresult->level23 + 16*count2;
7978 memcpy(mlevel1, level1, 32);
7979 memset(mlevel2, 0xFF, 16*count2);
7980 memset(mlevel3, 0, 128*count3);
7981 count3 = 0;
7982 for (i = 1; i < 256; i++) {
7983 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007984 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007985 /* unmapped character */
7986 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007987 o1 = PyUnicode_READ(kind, data, i)>>11;
7988 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007989 i2 = 16*mlevel1[o1] + o2;
7990 if (mlevel2[i2] == 0xFF)
7991 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007992 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007993 i3 = 128*mlevel2[i2] + o3;
7994 mlevel3[i3] = i;
7995 }
7996 return result;
7997}
7998
7999static int
8000encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
8001{
8002 struct encoding_map *map = (struct encoding_map*)mapping;
8003 int l1 = c>>11;
8004 int l2 = (c>>7) & 0xF;
8005 int l3 = c & 0x7F;
8006 int i;
8007
8008#ifdef Py_UNICODE_WIDE
8009 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008010 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008011 }
8012#endif
8013 if (c == 0)
8014 return 0;
8015 /* level 1*/
8016 i = map->level1[l1];
8017 if (i == 0xFF) {
8018 return -1;
8019 }
8020 /* level 2*/
8021 i = map->level23[16*i+l2];
8022 if (i == 0xFF) {
8023 return -1;
8024 }
8025 /* level 3 */
8026 i = map->level23[16*map->count2 + 128*i + l3];
8027 if (i == 0) {
8028 return -1;
8029 }
8030 return i;
8031}
8032
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008033/* Lookup the character ch in the mapping. If the character
8034 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008035 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008036static PyObject *
8037charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008038{
Christian Heimes217cfd12007-12-02 14:31:20 +00008039 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008040 PyObject *x;
8041
8042 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008043 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008044 x = PyObject_GetItem(mapping, w);
8045 Py_DECREF(w);
8046 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008047 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8048 /* No mapping found means: mapping is undefined. */
8049 PyErr_Clear();
8050 x = Py_None;
8051 Py_INCREF(x);
8052 return x;
8053 } else
8054 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008055 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008056 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008057 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008058 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008059 long value = PyLong_AS_LONG(x);
8060 if (value < 0 || value > 255) {
8061 PyErr_SetString(PyExc_TypeError,
8062 "character mapping must be in range(256)");
8063 Py_DECREF(x);
8064 return NULL;
8065 }
8066 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008067 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008068 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008069 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008070 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008071 /* wrong return value */
8072 PyErr_Format(PyExc_TypeError,
8073 "character mapping must return integer, bytes or None, not %.400s",
8074 x->ob_type->tp_name);
8075 Py_DECREF(x);
8076 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008077 }
8078}
8079
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008080static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008081charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008082{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008083 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8084 /* exponentially overallocate to minimize reallocations */
8085 if (requiredsize < 2*outsize)
8086 requiredsize = 2*outsize;
8087 if (_PyBytes_Resize(outobj, requiredsize))
8088 return -1;
8089 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008090}
8091
/* Outcome of charmapencode_output():
   enc_SUCCESS    the character was encoded and written to the output
   enc_FAILED     the mapping has no entry for the character; nothing was
                  written
   enc_EXCEPTION  the lookup or the output resize failed */
typedef enum charmapencode_result {
    enc_SUCCESS, enc_FAILED, enc_EXCEPTION
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008095/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008096 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008097 space is available. Return a new reference to the object that
8098 was put in the output buffer, or Py_None, if the mapping was undefined
8099 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008100 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008101static charmapencode_result
8102charmapencode_output(Py_UNICODE c, PyObject *mapping,
8103 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008104{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008105 PyObject *rep;
8106 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008107 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008108
Christian Heimes90aa7642007-12-19 02:45:37 +00008109 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008110 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008111 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008112 if (res == -1)
8113 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008114 if (outsize<requiredsize)
8115 if (charmapencode_resize(outobj, outpos, requiredsize))
8116 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008117 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008118 outstart[(*outpos)++] = (char)res;
8119 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008120 }
8121
8122 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008123 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008124 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008125 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008126 Py_DECREF(rep);
8127 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008128 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008129 if (PyLong_Check(rep)) {
8130 Py_ssize_t requiredsize = *outpos+1;
8131 if (outsize<requiredsize)
8132 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8133 Py_DECREF(rep);
8134 return enc_EXCEPTION;
8135 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008136 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008137 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008138 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008139 else {
8140 const char *repchars = PyBytes_AS_STRING(rep);
8141 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8142 Py_ssize_t requiredsize = *outpos+repsize;
8143 if (outsize<requiredsize)
8144 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8145 Py_DECREF(rep);
8146 return enc_EXCEPTION;
8147 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008148 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008149 memcpy(outstart + *outpos, repchars, repsize);
8150 *outpos += repsize;
8151 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008152 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008153 Py_DECREF(rep);
8154 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008155}
8156
8157/* handle an error in PyUnicode_EncodeCharmap
8158 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008159static int
8160charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00008161 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008162 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008163 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008164 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008165{
8166 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008167 Py_ssize_t repsize;
8168 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008169 Py_UNICODE *uni2;
8170 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008171 Py_ssize_t collstartpos = *inpos;
8172 Py_ssize_t collendpos = *inpos+1;
8173 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008174 char *encoding = "charmap";
8175 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008176 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008177
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008178 /* find all unencodable characters */
8179 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008180 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008181 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008182 int res = encoding_map_lookup(p[collendpos], mapping);
8183 if (res != -1)
8184 break;
8185 ++collendpos;
8186 continue;
8187 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008188
Benjamin Peterson29060642009-01-31 22:14:21 +00008189 rep = charmapencode_lookup(p[collendpos], mapping);
8190 if (rep==NULL)
8191 return -1;
8192 else if (rep!=Py_None) {
8193 Py_DECREF(rep);
8194 break;
8195 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008196 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008197 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008198 }
8199 /* cache callback name lookup
8200 * (if not done yet, i.e. it's the first error) */
8201 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008202 if ((errors==NULL) || (!strcmp(errors, "strict")))
8203 *known_errorHandler = 1;
8204 else if (!strcmp(errors, "replace"))
8205 *known_errorHandler = 2;
8206 else if (!strcmp(errors, "ignore"))
8207 *known_errorHandler = 3;
8208 else if (!strcmp(errors, "xmlcharrefreplace"))
8209 *known_errorHandler = 4;
8210 else
8211 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008212 }
8213 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008214 case 1: /* strict */
8215 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
8216 return -1;
8217 case 2: /* replace */
8218 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008219 x = charmapencode_output('?', mapping, res, respos);
8220 if (x==enc_EXCEPTION) {
8221 return -1;
8222 }
8223 else if (x==enc_FAILED) {
8224 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
8225 return -1;
8226 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008227 }
8228 /* fall through */
8229 case 3: /* ignore */
8230 *inpos = collendpos;
8231 break;
8232 case 4: /* xmlcharrefreplace */
8233 /* generate replacement (temporarily (mis)uses p) */
8234 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008235 char buffer[2+29+1+1];
8236 char *cp;
8237 sprintf(buffer, "&#%d;", (int)p[collpos]);
8238 for (cp = buffer; *cp; ++cp) {
8239 x = charmapencode_output(*cp, mapping, res, respos);
8240 if (x==enc_EXCEPTION)
8241 return -1;
8242 else if (x==enc_FAILED) {
8243 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
8244 return -1;
8245 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008246 }
8247 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008248 *inpos = collendpos;
8249 break;
8250 default:
8251 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00008252 encoding, reason, p, size, exceptionObject,
8253 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008254 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008255 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008256 if (PyBytes_Check(repunicode)) {
8257 /* Directly copy bytes result to output. */
8258 Py_ssize_t outsize = PyBytes_Size(*res);
8259 Py_ssize_t requiredsize;
8260 repsize = PyBytes_Size(repunicode);
8261 requiredsize = *respos + repsize;
8262 if (requiredsize > outsize)
8263 /* Make room for all additional bytes. */
8264 if (charmapencode_resize(res, respos, requiredsize)) {
8265 Py_DECREF(repunicode);
8266 return -1;
8267 }
8268 memcpy(PyBytes_AsString(*res) + *respos,
8269 PyBytes_AsString(repunicode), repsize);
8270 *respos += repsize;
8271 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008272 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008273 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008274 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008275 /* generate replacement */
8276 repsize = PyUnicode_GET_SIZE(repunicode);
8277 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008278 x = charmapencode_output(*uni2, mapping, res, respos);
8279 if (x==enc_EXCEPTION) {
8280 return -1;
8281 }
8282 else if (x==enc_FAILED) {
8283 Py_DECREF(repunicode);
8284 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
8285 return -1;
8286 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008287 }
8288 *inpos = newpos;
8289 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008290 }
8291 return 0;
8292}
8293
Alexander Belopolsky40018472011-02-26 01:02:56 +00008294PyObject *
8295PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8296 Py_ssize_t size,
8297 PyObject *mapping,
8298 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008299{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008300 /* output object */
8301 PyObject *res = NULL;
8302 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008303 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008304 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008305 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008306 PyObject *errorHandler = NULL;
8307 PyObject *exc = NULL;
8308 /* the following variable is used for caching string comparisons
8309 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8310 * 3=ignore, 4=xmlcharrefreplace */
8311 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008312
8313 /* Default to Latin-1 */
8314 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008315 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008316
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008317 /* allocate enough for a simple encoding without
8318 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008319 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008320 if (res == NULL)
8321 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008322 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008323 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008324
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008325 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008326 /* try to encode it */
8327 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
8328 if (x==enc_EXCEPTION) /* error */
8329 goto onError;
8330 if (x==enc_FAILED) { /* unencodable character */
8331 if (charmap_encoding_error(p, size, &inpos, mapping,
8332 &exc,
8333 &known_errorHandler, &errorHandler, errors,
8334 &res, &respos)) {
8335 goto onError;
8336 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008337 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008338 else
8339 /* done with this character => adjust input position */
8340 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008341 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008342
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008343 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008344 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008345 if (_PyBytes_Resize(&res, respos) < 0)
8346 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008347
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008348 Py_XDECREF(exc);
8349 Py_XDECREF(errorHandler);
8350 return res;
8351
Benjamin Peterson29060642009-01-31 22:14:21 +00008352 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008353 Py_XDECREF(res);
8354 Py_XDECREF(exc);
8355 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008356 return NULL;
8357}
8358
Alexander Belopolsky40018472011-02-26 01:02:56 +00008359PyObject *
8360PyUnicode_AsCharmapString(PyObject *unicode,
8361 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008362{
8363 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008364 PyErr_BadArgument();
8365 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008366 }
8367 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00008368 PyUnicode_GET_SIZE(unicode),
8369 mapping,
8370 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008371}
8372
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008373/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008374static void
8375make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008376 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008377 Py_ssize_t startpos, Py_ssize_t endpos,
8378 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008379{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008380 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008381 *exceptionObject = _PyUnicodeTranslateError_Create(
8382 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008383 }
8384 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008385 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8386 goto onError;
8387 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8388 goto onError;
8389 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8390 goto onError;
8391 return;
8392 onError:
8393 Py_DECREF(*exceptionObject);
8394 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008395 }
8396}
8397
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008398/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008399static void
8400raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008401 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008402 Py_ssize_t startpos, Py_ssize_t endpos,
8403 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008404{
8405 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008406 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008407 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008408 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008409}
8410
8411/* error handling callback helper:
8412 build arguments, call the callback and check the arguments,
8413 put the result into newpos and return the replacement string, which
8414 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008415static PyObject *
8416unicode_translate_call_errorhandler(const char *errors,
8417 PyObject **errorHandler,
8418 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008419 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008420 Py_ssize_t startpos, Py_ssize_t endpos,
8421 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008422{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008423 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008424
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008425 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008426 PyObject *restuple;
8427 PyObject *resunicode;
8428
8429 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008430 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008431 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008432 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008433 }
8434
8435 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008436 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008437 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008438 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008439
8440 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008441 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008442 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008443 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008444 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008445 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008446 Py_DECREF(restuple);
8447 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008448 }
8449 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008450 &resunicode, &i_newpos)) {
8451 Py_DECREF(restuple);
8452 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008453 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008454 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008455 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008456 else
8457 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008458 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008459 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8460 Py_DECREF(restuple);
8461 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008462 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008463 Py_INCREF(resunicode);
8464 Py_DECREF(restuple);
8465 return resunicode;
8466}
8467
8468/* Lookup the character ch in the mapping and put the result in result,
8469 which must be decrefed by the caller.
8470 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008471static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008472charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008473{
Christian Heimes217cfd12007-12-02 14:31:20 +00008474 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008475 PyObject *x;
8476
8477 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008478 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008479 x = PyObject_GetItem(mapping, w);
8480 Py_DECREF(w);
8481 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008482 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8483 /* No mapping found means: use 1:1 mapping. */
8484 PyErr_Clear();
8485 *result = NULL;
8486 return 0;
8487 } else
8488 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008489 }
8490 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008491 *result = x;
8492 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008493 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008494 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008495 long value = PyLong_AS_LONG(x);
8496 long max = PyUnicode_GetMax();
8497 if (value < 0 || value > max) {
8498 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008499 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008500 Py_DECREF(x);
8501 return -1;
8502 }
8503 *result = x;
8504 return 0;
8505 }
8506 else if (PyUnicode_Check(x)) {
8507 *result = x;
8508 return 0;
8509 }
8510 else {
8511 /* wrong return value */
8512 PyErr_SetString(PyExc_TypeError,
8513 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008514 Py_DECREF(x);
8515 return -1;
8516 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008517}
8518/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008519 if not reallocate and adjust various state variables.
8520 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008521static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008522charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008523 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008524{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008525 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00008526 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008527 /* exponentially overallocate to minimize reallocations */
8528 if (requiredsize < 2 * oldsize)
8529 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008530 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8531 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008532 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008533 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008534 }
8535 return 0;
8536}
8537/* lookup the character, put the result in the output string and adjust
8538 various state variables. Return a new reference to the object that
8539 was put in the output buffer in *result, or Py_None, if the mapping was
8540 undefined (in which case no character was written).
8541 The called must decref result.
8542 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008543static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008544charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8545 PyObject *mapping, Py_UCS4 **output,
8546 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008547 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008548{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008549 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8550 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008551 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008552 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008553 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008554 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008555 }
8556 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008557 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008558 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008559 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008560 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008561 }
8562 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008563 Py_ssize_t repsize;
8564 if (PyUnicode_READY(*res) == -1)
8565 return -1;
8566 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008567 if (repsize==1) {
8568 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008569 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008570 }
8571 else if (repsize!=0) {
8572 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008573 Py_ssize_t requiredsize = *opos +
8574 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008575 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008576 Py_ssize_t i;
8577 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008578 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008579 for(i = 0; i < repsize; i++)
8580 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008581 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008582 }
8583 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008584 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008585 return 0;
8586}
8587
Alexander Belopolsky40018472011-02-26 01:02:56 +00008588PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008589_PyUnicode_TranslateCharmap(PyObject *input,
8590 PyObject *mapping,
8591 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008592{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008593 /* input object */
8594 char *idata;
8595 Py_ssize_t size, i;
8596 int kind;
8597 /* output buffer */
8598 Py_UCS4 *output = NULL;
8599 Py_ssize_t osize;
8600 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008601 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008602 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008603 char *reason = "character maps to <undefined>";
8604 PyObject *errorHandler = NULL;
8605 PyObject *exc = NULL;
8606 /* the following variable is used for caching string comparisons
8607 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8608 * 3=ignore, 4=xmlcharrefreplace */
8609 int known_errorHandler = -1;
8610
Guido van Rossumd57fd912000-03-10 22:53:23 +00008611 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008612 PyErr_BadArgument();
8613 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008614 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008615
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008616 if (PyUnicode_READY(input) == -1)
8617 return NULL;
8618 idata = (char*)PyUnicode_DATA(input);
8619 kind = PyUnicode_KIND(input);
8620 size = PyUnicode_GET_LENGTH(input);
8621 i = 0;
8622
8623 if (size == 0) {
8624 Py_INCREF(input);
8625 return input;
8626 }
8627
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008628 /* allocate enough for a simple 1:1 translation without
8629 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008630 osize = size;
8631 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8632 opos = 0;
8633 if (output == NULL) {
8634 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008635 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008636 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008637
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008638 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008639 /* try to encode it */
8640 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008641 if (charmaptranslate_output(input, i, mapping,
8642 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008643 Py_XDECREF(x);
8644 goto onError;
8645 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008646 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008647 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008648 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008649 else { /* untranslatable character */
8650 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8651 Py_ssize_t repsize;
8652 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008653 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008654 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008655 Py_ssize_t collstart = i;
8656 Py_ssize_t collend = i+1;
8657 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008658
Benjamin Peterson29060642009-01-31 22:14:21 +00008659 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008660 while (collend < size) {
8661 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008662 goto onError;
8663 Py_XDECREF(x);
8664 if (x!=Py_None)
8665 break;
8666 ++collend;
8667 }
8668 /* cache callback name lookup
8669 * (if not done yet, i.e. it's the first error) */
8670 if (known_errorHandler==-1) {
8671 if ((errors==NULL) || (!strcmp(errors, "strict")))
8672 known_errorHandler = 1;
8673 else if (!strcmp(errors, "replace"))
8674 known_errorHandler = 2;
8675 else if (!strcmp(errors, "ignore"))
8676 known_errorHandler = 3;
8677 else if (!strcmp(errors, "xmlcharrefreplace"))
8678 known_errorHandler = 4;
8679 else
8680 known_errorHandler = 0;
8681 }
8682 switch (known_errorHandler) {
8683 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008684 raise_translate_exception(&exc, input, collstart,
8685 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008686 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008687 case 2: /* replace */
8688 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008689 for (coll = collstart; coll<collend; coll++)
8690 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008691 /* fall through */
8692 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008693 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008694 break;
8695 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008696 /* generate replacement (temporarily (mis)uses i) */
8697 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008698 char buffer[2+29+1+1];
8699 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008700 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8701 if (charmaptranslate_makespace(&output, &osize,
8702 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008703 goto onError;
8704 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008705 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008706 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008707 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008708 break;
8709 default:
8710 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008711 reason, input, &exc,
8712 collstart, collend, &newpos);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02008713 if (repunicode == NULL || _PyUnicode_READY_REPLACE(&repunicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00008714 goto onError;
8715 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008716 repsize = PyUnicode_GET_LENGTH(repunicode);
8717 if (charmaptranslate_makespace(&output, &osize,
8718 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008719 Py_DECREF(repunicode);
8720 goto onError;
8721 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008722 for (uni2 = 0; repsize-->0; ++uni2)
8723 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8724 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008725 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008726 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008727 }
8728 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008729 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8730 if (!res)
8731 goto onError;
8732 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008733 Py_XDECREF(exc);
8734 Py_XDECREF(errorHandler);
8735 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008736
Benjamin Peterson29060642009-01-31 22:14:21 +00008737 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008738 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008739 Py_XDECREF(exc);
8740 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008741 return NULL;
8742}
8743
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008744/* Deprecated. Use PyUnicode_Translate instead. */
8745PyObject *
8746PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8747 Py_ssize_t size,
8748 PyObject *mapping,
8749 const char *errors)
8750{
8751 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8752 if (!unicode)
8753 return NULL;
8754 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8755}
8756
Alexander Belopolsky40018472011-02-26 01:02:56 +00008757PyObject *
8758PyUnicode_Translate(PyObject *str,
8759 PyObject *mapping,
8760 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008761{
8762 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008763
Guido van Rossumd57fd912000-03-10 22:53:23 +00008764 str = PyUnicode_FromObject(str);
8765 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008766 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008767 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008768 Py_DECREF(str);
8769 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008770
Benjamin Peterson29060642009-01-31 22:14:21 +00008771 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008772 Py_XDECREF(str);
8773 return NULL;
8774}
Tim Petersced69f82003-09-16 20:30:58 +00008775
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008776static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008777fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008778{
8779 /* No need to call PyUnicode_READY(self) because this function is only
8780 called as a callback from fixup() which does it already. */
8781 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8782 const int kind = PyUnicode_KIND(self);
8783 void *data = PyUnicode_DATA(self);
8784 Py_UCS4 maxchar = 0, ch, fixed;
8785 Py_ssize_t i;
8786
8787 for (i = 0; i < len; ++i) {
8788 ch = PyUnicode_READ(kind, data, i);
8789 fixed = 0;
8790 if (ch > 127) {
8791 if (Py_UNICODE_ISSPACE(ch))
8792 fixed = ' ';
8793 else {
8794 const int decimal = Py_UNICODE_TODECIMAL(ch);
8795 if (decimal >= 0)
8796 fixed = '0' + decimal;
8797 }
8798 if (fixed != 0) {
8799 if (fixed > maxchar)
8800 maxchar = fixed;
8801 PyUnicode_WRITE(kind, data, i, fixed);
8802 }
8803 else if (ch > maxchar)
8804 maxchar = ch;
8805 }
8806 else if (ch > maxchar)
8807 maxchar = ch;
8808 }
8809
8810 return maxchar;
8811}
8812
8813PyObject *
8814_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8815{
8816 if (!PyUnicode_Check(unicode)) {
8817 PyErr_BadInternalCall();
8818 return NULL;
8819 }
8820 if (PyUnicode_READY(unicode) == -1)
8821 return NULL;
8822 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8823 /* If the string is already ASCII, just return the same string */
8824 Py_INCREF(unicode);
8825 return unicode;
8826 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008827 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008828}
8829
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008830PyObject *
8831PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8832 Py_ssize_t length)
8833{
8834 PyObject *result;
8835 Py_UNICODE *p; /* write pointer into result */
8836 Py_ssize_t i;
8837 /* Copy to a new string */
8838 result = (PyObject *)_PyUnicode_New(length);
8839 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
8840 if (result == NULL)
8841 return result;
8842 p = PyUnicode_AS_UNICODE(result);
8843 /* Iterate over code points */
8844 for (i = 0; i < length; i++) {
8845 Py_UNICODE ch =s[i];
8846 if (ch > 127) {
8847 int decimal = Py_UNICODE_TODECIMAL(ch);
8848 if (decimal >= 0)
8849 p[i] = '0' + decimal;
8850 }
8851 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008852#ifndef DONT_MAKE_RESULT_READY
8853 if (_PyUnicode_READY_REPLACE(&result)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008854 Py_DECREF(result);
8855 return NULL;
8856 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008857#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02008858 assert(_PyUnicode_CheckConsistency(result, 1));
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008859 return result;
8860}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008861/* --- Decimal Encoder ---------------------------------------------------- */
8862
Alexander Belopolsky40018472011-02-26 01:02:56 +00008863int
8864PyUnicode_EncodeDecimal(Py_UNICODE *s,
8865 Py_ssize_t length,
8866 char *output,
8867 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008868{
8869 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008870 PyObject *errorHandler = NULL;
8871 PyObject *exc = NULL;
8872 const char *encoding = "decimal";
8873 const char *reason = "invalid decimal Unicode string";
8874 /* the following variable is used for caching string comparisons
8875 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
8876 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008877
8878 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008879 PyErr_BadArgument();
8880 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008881 }
8882
8883 p = s;
8884 end = s + length;
8885 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008886 register Py_UNICODE ch = *p;
8887 int decimal;
8888 PyObject *repunicode;
8889 Py_ssize_t repsize;
8890 Py_ssize_t newpos;
8891 Py_UNICODE *uni2;
8892 Py_UNICODE *collstart;
8893 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00008894
Benjamin Peterson29060642009-01-31 22:14:21 +00008895 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008896 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00008897 ++p;
8898 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008899 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008900 decimal = Py_UNICODE_TODECIMAL(ch);
8901 if (decimal >= 0) {
8902 *output++ = '0' + decimal;
8903 ++p;
8904 continue;
8905 }
8906 if (0 < ch && ch < 256) {
8907 *output++ = (char)ch;
8908 ++p;
8909 continue;
8910 }
8911 /* All other characters are considered unencodable */
8912 collstart = p;
8913 collend = p+1;
8914 while (collend < end) {
8915 if ((0 < *collend && *collend < 256) ||
8916 !Py_UNICODE_ISSPACE(*collend) ||
8917 Py_UNICODE_TODECIMAL(*collend))
8918 break;
8919 }
8920 /* cache callback name lookup
8921 * (if not done yet, i.e. it's the first error) */
8922 if (known_errorHandler==-1) {
8923 if ((errors==NULL) || (!strcmp(errors, "strict")))
8924 known_errorHandler = 1;
8925 else if (!strcmp(errors, "replace"))
8926 known_errorHandler = 2;
8927 else if (!strcmp(errors, "ignore"))
8928 known_errorHandler = 3;
8929 else if (!strcmp(errors, "xmlcharrefreplace"))
8930 known_errorHandler = 4;
8931 else
8932 known_errorHandler = 0;
8933 }
8934 switch (known_errorHandler) {
8935 case 1: /* strict */
8936 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
8937 goto onError;
8938 case 2: /* replace */
8939 for (p = collstart; p < collend; ++p)
8940 *output++ = '?';
8941 /* fall through */
8942 case 3: /* ignore */
8943 p = collend;
8944 break;
8945 case 4: /* xmlcharrefreplace */
8946 /* generate replacement (temporarily (mis)uses p) */
8947 for (p = collstart; p < collend; ++p)
8948 output += sprintf(output, "&#%d;", (int)*p);
8949 p = collend;
8950 break;
8951 default:
8952 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
8953 encoding, reason, s, length, &exc,
8954 collstart-s, collend-s, &newpos);
8955 if (repunicode == NULL)
8956 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008957 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00008958 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008959 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8960 Py_DECREF(repunicode);
8961 goto onError;
8962 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008963 /* generate replacement */
8964 repsize = PyUnicode_GET_SIZE(repunicode);
8965 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8966 Py_UNICODE ch = *uni2;
8967 if (Py_UNICODE_ISSPACE(ch))
8968 *output++ = ' ';
8969 else {
8970 decimal = Py_UNICODE_TODECIMAL(ch);
8971 if (decimal >= 0)
8972 *output++ = '0' + decimal;
8973 else if (0 < ch && ch < 256)
8974 *output++ = (char)ch;
8975 else {
8976 Py_DECREF(repunicode);
8977 raise_encode_exception(&exc, encoding,
8978 s, length, collstart-s, collend-s, reason);
8979 goto onError;
8980 }
8981 }
8982 }
8983 p = s + newpos;
8984 Py_DECREF(repunicode);
8985 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00008986 }
8987 /* 0-terminate the output string */
8988 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008989 Py_XDECREF(exc);
8990 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008991 return 0;
8992
Benjamin Peterson29060642009-01-31 22:14:21 +00008993 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008994 Py_XDECREF(exc);
8995 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008996 return -1;
8997}
8998
Guido van Rossumd57fd912000-03-10 22:53:23 +00008999/* --- Helpers ------------------------------------------------------------ */
9000
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009001static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02009002any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009003 Py_ssize_t start,
9004 Py_ssize_t end)
9005{
9006 int kind1, kind2, kind;
9007 void *buf1, *buf2;
9008 Py_ssize_t len1, len2, result;
9009
9010 kind1 = PyUnicode_KIND(s1);
9011 kind2 = PyUnicode_KIND(s2);
9012 kind = kind1 > kind2 ? kind1 : kind2;
9013 buf1 = PyUnicode_DATA(s1);
9014 buf2 = PyUnicode_DATA(s2);
9015 if (kind1 != kind)
9016 buf1 = _PyUnicode_AsKind(s1, kind);
9017 if (!buf1)
9018 return -2;
9019 if (kind2 != kind)
9020 buf2 = _PyUnicode_AsKind(s2, kind);
9021 if (!buf2) {
9022 if (kind1 != kind) PyMem_Free(buf1);
9023 return -2;
9024 }
9025 len1 = PyUnicode_GET_LENGTH(s1);
9026 len2 = PyUnicode_GET_LENGTH(s2);
9027
Victor Stinner794d5672011-10-10 03:21:36 +02009028 if (direction > 0) {
9029 switch(kind) {
9030 case PyUnicode_1BYTE_KIND:
9031 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9032 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9033 else
9034 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9035 break;
9036 case PyUnicode_2BYTE_KIND:
9037 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9038 break;
9039 case PyUnicode_4BYTE_KIND:
9040 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9041 break;
9042 default:
9043 assert(0); result = -2;
9044 }
9045 }
9046 else {
9047 switch(kind) {
9048 case PyUnicode_1BYTE_KIND:
9049 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9050 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9051 else
9052 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9053 break;
9054 case PyUnicode_2BYTE_KIND:
9055 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9056 break;
9057 case PyUnicode_4BYTE_KIND:
9058 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9059 break;
9060 default:
9061 assert(0); result = -2;
9062 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009063 }
9064
9065 if (kind1 != kind)
9066 PyMem_Free(buf1);
9067 if (kind2 != kind)
9068 PyMem_Free(buf2);
9069
9070 return result;
9071}
9072
9073Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009074_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009075 Py_ssize_t n_buffer,
9076 void *digits, Py_ssize_t n_digits,
9077 Py_ssize_t min_width,
9078 const char *grouping,
9079 const char *thousands_sep)
9080{
9081 switch(kind) {
9082 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009083 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
9084 return _PyUnicode_ascii_InsertThousandsGrouping(
9085 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9086 min_width, grouping, thousands_sep);
9087 else
9088 return _PyUnicode_ucs1_InsertThousandsGrouping(
9089 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9090 min_width, grouping, thousands_sep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009091 case PyUnicode_2BYTE_KIND:
9092 return _PyUnicode_ucs2_InsertThousandsGrouping(
9093 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
9094 min_width, grouping, thousands_sep);
9095 case PyUnicode_4BYTE_KIND:
9096 return _PyUnicode_ucs4_InsertThousandsGrouping(
9097 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
9098 min_width, grouping, thousands_sep);
9099 }
9100 assert(0);
9101 return -1;
9102}
9103
9104
/* helper macro to fixup start/end slice values:
     - `end` greater than `len` is clipped to `len`; a negative `end` is
       counted from the end of the sequence and clipped at 0,
     - a negative `start` is counted from the end and clipped at 0
       (`start` is NOT clipped against `len`).
   `start` and `end` must be modifiable lvalues; every argument is
   evaluated more than once.  The body is wrapped in do { } while (0) so
   the macro behaves as a single statement, e.g. inside an unbraced
   if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009119
Alexander Belopolsky40018472011-02-26 01:02:56 +00009120Py_ssize_t
9121PyUnicode_Count(PyObject *str,
9122 PyObject *substr,
9123 Py_ssize_t start,
9124 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009125{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009126 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009127 PyUnicodeObject* str_obj;
9128 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009129 int kind1, kind2, kind;
9130 void *buf1 = NULL, *buf2 = NULL;
9131 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009132
Thomas Wouters477c8d52006-05-27 19:21:47 +00009133 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009134 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009135 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009136 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02009137 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009138 Py_DECREF(str_obj);
9139 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009140 }
Tim Petersced69f82003-09-16 20:30:58 +00009141
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009142 kind1 = PyUnicode_KIND(str_obj);
9143 kind2 = PyUnicode_KIND(sub_obj);
9144 kind = kind1 > kind2 ? kind1 : kind2;
9145 buf1 = PyUnicode_DATA(str_obj);
9146 if (kind1 != kind)
9147 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
9148 if (!buf1)
9149 goto onError;
9150 buf2 = PyUnicode_DATA(sub_obj);
9151 if (kind2 != kind)
9152 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
9153 if (!buf2)
9154 goto onError;
9155 len1 = PyUnicode_GET_LENGTH(str_obj);
9156 len2 = PyUnicode_GET_LENGTH(sub_obj);
9157
9158 ADJUST_INDICES(start, end, len1);
9159 switch(kind) {
9160 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009161 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9162 result = asciilib_count(
9163 ((Py_UCS1*)buf1) + start, end - start,
9164 buf2, len2, PY_SSIZE_T_MAX
9165 );
9166 else
9167 result = ucs1lib_count(
9168 ((Py_UCS1*)buf1) + start, end - start,
9169 buf2, len2, PY_SSIZE_T_MAX
9170 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009171 break;
9172 case PyUnicode_2BYTE_KIND:
9173 result = ucs2lib_count(
9174 ((Py_UCS2*)buf1) + start, end - start,
9175 buf2, len2, PY_SSIZE_T_MAX
9176 );
9177 break;
9178 case PyUnicode_4BYTE_KIND:
9179 result = ucs4lib_count(
9180 ((Py_UCS4*)buf1) + start, end - start,
9181 buf2, len2, PY_SSIZE_T_MAX
9182 );
9183 break;
9184 default:
9185 assert(0); result = 0;
9186 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009187
9188 Py_DECREF(sub_obj);
9189 Py_DECREF(str_obj);
9190
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009191 if (kind1 != kind)
9192 PyMem_Free(buf1);
9193 if (kind2 != kind)
9194 PyMem_Free(buf2);
9195
Guido van Rossumd57fd912000-03-10 22:53:23 +00009196 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009197 onError:
9198 Py_DECREF(sub_obj);
9199 Py_DECREF(str_obj);
9200 if (kind1 != kind && buf1)
9201 PyMem_Free(buf1);
9202 if (kind2 != kind && buf2)
9203 PyMem_Free(buf2);
9204 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009205}
9206
Alexander Belopolsky40018472011-02-26 01:02:56 +00009207Py_ssize_t
9208PyUnicode_Find(PyObject *str,
9209 PyObject *sub,
9210 Py_ssize_t start,
9211 Py_ssize_t end,
9212 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009213{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009214 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009215
Guido van Rossumd57fd912000-03-10 22:53:23 +00009216 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009217 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009218 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009219 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009220 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009221 Py_DECREF(str);
9222 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009223 }
Tim Petersced69f82003-09-16 20:30:58 +00009224
Victor Stinner794d5672011-10-10 03:21:36 +02009225 result = any_find_slice(direction,
9226 str, sub, start, end
9227 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009228
Guido van Rossumd57fd912000-03-10 22:53:23 +00009229 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009230 Py_DECREF(sub);
9231
Guido van Rossumd57fd912000-03-10 22:53:23 +00009232 return result;
9233}
9234
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009235Py_ssize_t
9236PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9237 Py_ssize_t start, Py_ssize_t end,
9238 int direction)
9239{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009240 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009241 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009242 if (PyUnicode_READY(str) == -1)
9243 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009244 if (start < 0 || end < 0) {
9245 PyErr_SetString(PyExc_IndexError, "string index out of range");
9246 return -2;
9247 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009248 if (end > PyUnicode_GET_LENGTH(str))
9249 end = PyUnicode_GET_LENGTH(str);
9250 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009251 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9252 kind, end-start, ch, direction);
9253 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009254 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009255 else
9256 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009257}
9258
Alexander Belopolsky40018472011-02-26 01:02:56 +00009259static int
9260tailmatch(PyUnicodeObject *self,
9261 PyUnicodeObject *substring,
9262 Py_ssize_t start,
9263 Py_ssize_t end,
9264 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009265{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009266 int kind_self;
9267 int kind_sub;
9268 void *data_self;
9269 void *data_sub;
9270 Py_ssize_t offset;
9271 Py_ssize_t i;
9272 Py_ssize_t end_sub;
9273
9274 if (PyUnicode_READY(self) == -1 ||
9275 PyUnicode_READY(substring) == -1)
9276 return 0;
9277
9278 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009279 return 1;
9280
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009281 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9282 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009283 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009284 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009285
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009286 kind_self = PyUnicode_KIND(self);
9287 data_self = PyUnicode_DATA(self);
9288 kind_sub = PyUnicode_KIND(substring);
9289 data_sub = PyUnicode_DATA(substring);
9290 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9291
9292 if (direction > 0)
9293 offset = end;
9294 else
9295 offset = start;
9296
9297 if (PyUnicode_READ(kind_self, data_self, offset) ==
9298 PyUnicode_READ(kind_sub, data_sub, 0) &&
9299 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9300 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9301 /* If both are of the same kind, memcmp is sufficient */
9302 if (kind_self == kind_sub) {
9303 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009304 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009305 data_sub,
9306 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009307 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009308 }
9309 /* otherwise we have to compare each character by first accesing it */
9310 else {
9311 /* We do not need to compare 0 and len(substring)-1 because
9312 the if statement above ensured already that they are equal
9313 when we end up here. */
9314 // TODO: honor direction and do a forward or backwards search
9315 for (i = 1; i < end_sub; ++i) {
9316 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9317 PyUnicode_READ(kind_sub, data_sub, i))
9318 return 0;
9319 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009320 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009321 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009322 }
9323
9324 return 0;
9325}
9326
Alexander Belopolsky40018472011-02-26 01:02:56 +00009327Py_ssize_t
9328PyUnicode_Tailmatch(PyObject *str,
9329 PyObject *substr,
9330 Py_ssize_t start,
9331 Py_ssize_t end,
9332 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009333{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009334 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009335
Guido van Rossumd57fd912000-03-10 22:53:23 +00009336 str = PyUnicode_FromObject(str);
9337 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009338 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009339 substr = PyUnicode_FromObject(substr);
9340 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009341 Py_DECREF(str);
9342 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009343 }
Tim Petersced69f82003-09-16 20:30:58 +00009344
Guido van Rossumd57fd912000-03-10 22:53:23 +00009345 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00009346 (PyUnicodeObject *)substr,
9347 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009348 Py_DECREF(str);
9349 Py_DECREF(substr);
9350 return result;
9351}
9352
Guido van Rossumd57fd912000-03-10 22:53:23 +00009353/* Apply fixfct filter to the Unicode object self and return a
9354 reference to the modified object */
9355
Alexander Belopolsky40018472011-02-26 01:02:56 +00009356static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009357fixup(PyObject *self,
9358 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009359{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009360 PyObject *u;
9361 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009362
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009363 if (PyUnicode_READY(self) == -1)
9364 return NULL;
9365 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
9366 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
9367 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009368 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009369 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009370
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009371 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009372 PyUnicode_GET_LENGTH(u) * PyUnicode_KIND(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009373
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009374 /* fix functions return the new maximum character in a string,
9375 if the kind of the resulting unicode object does not change,
9376 everything is fine. Otherwise we need to change the string kind
9377 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009378 maxchar_new = fixfct(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009379 if (maxchar_new == 0)
9380 /* do nothing, keep maxchar_new at 0 which means no changes. */;
9381 else if (maxchar_new <= 127)
9382 maxchar_new = 127;
9383 else if (maxchar_new <= 255)
9384 maxchar_new = 255;
9385 else if (maxchar_new <= 65535)
9386 maxchar_new = 65535;
9387 else
9388 maxchar_new = 1114111; /* 0x10ffff */
9389
9390 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009391 /* fixfct should return TRUE if it modified the buffer. If
9392 FALSE, return a reference to the original buffer instead
9393 (to save space, not time) */
9394 Py_INCREF(self);
9395 Py_DECREF(u);
9396 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009397 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009398 else if (maxchar_new == maxchar_old) {
9399 return u;
9400 }
9401 else {
9402 /* In case the maximum character changed, we need to
9403 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009404 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009405 if (v == NULL) {
9406 Py_DECREF(u);
9407 return NULL;
9408 }
9409 if (maxchar_new > maxchar_old) {
9410 /* If the maxchar increased so that the kind changed, not all
9411 characters are representable anymore and we need to fix the
9412 string again. This only happens in very few cases. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009413 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner9310abb2011-10-05 00:59:23 +02009414 maxchar_old = fixfct(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009415 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
9416 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009417 else {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009418 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009419 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009420
9421 Py_DECREF(u);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009422 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009423 return v;
9424 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009425}
9426
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009427static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009428fixupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009429{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009430 /* No need to call PyUnicode_READY(self) because this function is only
9431 called as a callback from fixup() which does it already. */
9432 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9433 const int kind = PyUnicode_KIND(self);
9434 void *data = PyUnicode_DATA(self);
9435 int touched = 0;
9436 Py_UCS4 maxchar = 0;
9437 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009438
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009439 for (i = 0; i < len; ++i) {
9440 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9441 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
9442 if (up != ch) {
9443 if (up > maxchar)
9444 maxchar = up;
9445 PyUnicode_WRITE(kind, data, i, up);
9446 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009447 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009448 else if (ch > maxchar)
9449 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009450 }
9451
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009452 if (touched)
9453 return maxchar;
9454 else
9455 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009456}
9457
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009458static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009459fixlower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009460{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009461 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9462 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9463 const int kind = PyUnicode_KIND(self);
9464 void *data = PyUnicode_DATA(self);
9465 int touched = 0;
9466 Py_UCS4 maxchar = 0;
9467 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009468
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009469 for(i = 0; i < len; ++i) {
9470 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9471 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9472 if (lo != ch) {
9473 if (lo > maxchar)
9474 maxchar = lo;
9475 PyUnicode_WRITE(kind, data, i, lo);
9476 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009477 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009478 else if (ch > maxchar)
9479 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009480 }
9481
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009482 if (touched)
9483 return maxchar;
9484 else
9485 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009486}
9487
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009488static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009489fixswapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009490{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009491 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9492 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9493 const int kind = PyUnicode_KIND(self);
9494 void *data = PyUnicode_DATA(self);
9495 int touched = 0;
9496 Py_UCS4 maxchar = 0;
9497 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009498
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009499 for(i = 0; i < len; ++i) {
9500 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9501 Py_UCS4 nu = 0;
9502
9503 if (Py_UNICODE_ISUPPER(ch))
9504 nu = Py_UNICODE_TOLOWER(ch);
9505 else if (Py_UNICODE_ISLOWER(ch))
9506 nu = Py_UNICODE_TOUPPER(ch);
9507
9508 if (nu != 0) {
9509 if (nu > maxchar)
9510 maxchar = nu;
9511 PyUnicode_WRITE(kind, data, i, nu);
9512 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009513 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009514 else if (ch > maxchar)
9515 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009516 }
9517
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009518 if (touched)
9519 return maxchar;
9520 else
9521 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009522}
9523
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009524static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009525fixcapitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009526{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009527 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9528 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9529 const int kind = PyUnicode_KIND(self);
9530 void *data = PyUnicode_DATA(self);
9531 int touched = 0;
9532 Py_UCS4 maxchar = 0;
9533 Py_ssize_t i = 0;
9534 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00009535
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009536 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009537 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009538
9539 ch = PyUnicode_READ(kind, data, i);
9540 if (!Py_UNICODE_ISUPPER(ch)) {
9541 maxchar = Py_UNICODE_TOUPPER(ch);
9542 PyUnicode_WRITE(kind, data, i, maxchar);
9543 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009544 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009545 ++i;
9546 for(; i < len; ++i) {
9547 ch = PyUnicode_READ(kind, data, i);
9548 if (!Py_UNICODE_ISLOWER(ch)) {
9549 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9550 if (lo > maxchar)
9551 maxchar = lo;
9552 PyUnicode_WRITE(kind, data, i, lo);
9553 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009554 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009555 else if (ch > maxchar)
9556 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009557 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009558
9559 if (touched)
9560 return maxchar;
9561 else
9562 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009563}
9564
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009565static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009566fixtitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009567{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009568 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9569 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9570 const int kind = PyUnicode_KIND(self);
9571 void *data = PyUnicode_DATA(self);
9572 Py_UCS4 maxchar = 0;
9573 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009574 int previous_is_cased;
9575
9576 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009577 if (len == 1) {
9578 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9579 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
9580 if (ti != ch) {
9581 PyUnicode_WRITE(kind, data, i, ti);
9582 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00009583 }
9584 else
9585 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009586 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009587 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009588 for(; i < len; ++i) {
9589 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9590 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00009591
Benjamin Peterson29060642009-01-31 22:14:21 +00009592 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009593 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00009594 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009595 nu = Py_UNICODE_TOTITLE(ch);
9596
9597 if (nu > maxchar)
9598 maxchar = nu;
9599 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00009600
Benjamin Peterson29060642009-01-31 22:14:21 +00009601 if (Py_UNICODE_ISLOWER(ch) ||
9602 Py_UNICODE_ISUPPER(ch) ||
9603 Py_UNICODE_ISTITLE(ch))
9604 previous_is_cased = 1;
9605 else
9606 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009607 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009608 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009609}
9610
Tim Peters8ce9f162004-08-27 01:49:32 +00009611PyObject *
9612PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009613{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009614 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009615 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009616 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009617 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009618 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9619 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009620 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009621 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009622 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009623 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009624 int use_memcpy;
9625 unsigned char *res_data = NULL, *sep_data = NULL;
9626 PyObject *last_obj;
9627 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009628
Tim Peters05eba1f2004-08-27 21:32:02 +00009629 fseq = PySequence_Fast(seq, "");
9630 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009631 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009632 }
9633
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009634 /* NOTE: the following code can't call back into Python code,
9635 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009636 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009637
Tim Peters05eba1f2004-08-27 21:32:02 +00009638 seqlen = PySequence_Fast_GET_SIZE(fseq);
9639 /* If empty sequence, return u"". */
9640 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009641 Py_DECREF(fseq);
9642 Py_INCREF(unicode_empty);
9643 res = unicode_empty;
9644 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009645 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009646
Tim Peters05eba1f2004-08-27 21:32:02 +00009647 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009648 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009649 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009650 if (seqlen == 1) {
9651 if (PyUnicode_CheckExact(items[0])) {
9652 res = items[0];
9653 Py_INCREF(res);
9654 Py_DECREF(fseq);
9655 return res;
9656 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009657 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009658 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009659 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009660 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009661 /* Set up sep and seplen */
9662 if (separator == NULL) {
9663 /* fall back to a blank space separator */
9664 sep = PyUnicode_FromOrdinal(' ');
9665 if (!sep)
9666 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009667 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009668 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009669 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009670 else {
9671 if (!PyUnicode_Check(separator)) {
9672 PyErr_Format(PyExc_TypeError,
9673 "separator: expected str instance,"
9674 " %.80s found",
9675 Py_TYPE(separator)->tp_name);
9676 goto onError;
9677 }
9678 if (PyUnicode_READY(separator))
9679 goto onError;
9680 sep = separator;
9681 seplen = PyUnicode_GET_LENGTH(separator);
9682 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9683 /* inc refcount to keep this code path symmetric with the
9684 above case of a blank separator */
9685 Py_INCREF(sep);
9686 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009687 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009688 }
9689
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009690 /* There are at least two things to join, or else we have a subclass
9691 * of str in the sequence.
9692 * Do a pre-pass to figure out the total amount of space we'll
9693 * need (sz), and see whether all argument are strings.
9694 */
9695 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009696#ifdef Py_DEBUG
9697 use_memcpy = 0;
9698#else
9699 use_memcpy = 1;
9700#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009701 for (i = 0; i < seqlen; i++) {
9702 const Py_ssize_t old_sz = sz;
9703 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009704 if (!PyUnicode_Check(item)) {
9705 PyErr_Format(PyExc_TypeError,
9706 "sequence item %zd: expected str instance,"
9707 " %.80s found",
9708 i, Py_TYPE(item)->tp_name);
9709 goto onError;
9710 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009711 if (PyUnicode_READY(item) == -1)
9712 goto onError;
9713 sz += PyUnicode_GET_LENGTH(item);
9714 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009715 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009716 if (i != 0)
9717 sz += seplen;
9718 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9719 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009720 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009721 goto onError;
9722 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009723 if (use_memcpy && last_obj != NULL) {
9724 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9725 use_memcpy = 0;
9726 }
9727 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009728 }
Tim Petersced69f82003-09-16 20:30:58 +00009729
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009730 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009731 if (res == NULL)
9732 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009733
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009734 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009735#ifdef Py_DEBUG
9736 use_memcpy = 0;
9737#else
9738 if (use_memcpy) {
9739 res_data = PyUnicode_1BYTE_DATA(res);
9740 kind = PyUnicode_KIND(res);
9741 if (seplen != 0)
9742 sep_data = PyUnicode_1BYTE_DATA(sep);
9743 }
9744#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009745 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009746 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009747 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009748 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009749 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009750 if (use_memcpy) {
9751 Py_MEMCPY(res_data,
9752 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009753 kind * seplen);
9754 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009755 }
9756 else {
9757 copy_characters(res, res_offset, sep, 0, seplen);
9758 res_offset += seplen;
9759 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009760 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009761 itemlen = PyUnicode_GET_LENGTH(item);
9762 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009763 if (use_memcpy) {
9764 Py_MEMCPY(res_data,
9765 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009766 kind * itemlen);
9767 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009768 }
9769 else {
9770 copy_characters(res, res_offset, item, 0, itemlen);
9771 res_offset += itemlen;
9772 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009773 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009774 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009775 if (use_memcpy)
9776 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009777 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009778 else
9779 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009780
Tim Peters05eba1f2004-08-27 21:32:02 +00009781 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009782 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009783 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009784 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009785
Benjamin Peterson29060642009-01-31 22:14:21 +00009786 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009787 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009788 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009789 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009790 return NULL;
9791}
9792
/* FILL(kind, data, value, start, length)

   Store `value` into `length` consecutive code units of the buffer `data`,
   starting at unit index `start`.  `kind` selects the unit width:
   PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND, or anything else for 4-byte
   units; PyUnicode_WCHAR_KIND is rejected by an assertion.

   `value` and `length` are evaluated exactly once, and every macro argument
   is parenthesized, so arbitrary expressions can be passed safely. */
#define FILL(kind, data, value, start, length) \
    do { \
        const Py_UCS4 fill_ = (value); \
        const Py_ssize_t count_ = (length); \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)fill_, (size_t)count_); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < count_; ++i_, ++to_) *to_ = (Py_UCS2)fill_; \
            break; \
        } \
        default: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < count_; ++i_, ++to_) *to_ = fill_; \
            break; \
        } \
        } \
    } while (0)
9815
Victor Stinner9310abb2011-10-05 00:59:23 +02009816static PyObject *
9817pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009818 Py_ssize_t left,
9819 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009820 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009821{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009822 PyObject *u;
9823 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009824 int kind;
9825 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009826
9827 if (left < 0)
9828 left = 0;
9829 if (right < 0)
9830 right = 0;
9831
Tim Peters7a29bd52001-09-12 03:03:31 +00009832 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009833 Py_INCREF(self);
9834 return self;
9835 }
9836
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009837 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9838 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009839 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9840 return NULL;
9841 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009842 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9843 if (fill > maxchar)
9844 maxchar = fill;
9845 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009846 if (!u)
9847 return NULL;
9848
9849 kind = PyUnicode_KIND(u);
9850 data = PyUnicode_DATA(u);
9851 if (left)
9852 FILL(kind, data, fill, 0, left);
9853 if (right)
9854 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009855 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009856 assert(_PyUnicode_CheckConsistency(u, 1));
9857 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009858}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009859#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009860
Alexander Belopolsky40018472011-02-26 01:02:56 +00009861PyObject *
9862PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009863{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009864 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009865
9866 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009867 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009868 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009869
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009870 switch(PyUnicode_KIND(string)) {
9871 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009872 if (PyUnicode_IS_ASCII(string))
9873 list = asciilib_splitlines(
9874 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9875 PyUnicode_GET_LENGTH(string), keepends);
9876 else
9877 list = ucs1lib_splitlines(
9878 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9879 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009880 break;
9881 case PyUnicode_2BYTE_KIND:
9882 list = ucs2lib_splitlines(
9883 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
9884 PyUnicode_GET_LENGTH(string), keepends);
9885 break;
9886 case PyUnicode_4BYTE_KIND:
9887 list = ucs4lib_splitlines(
9888 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
9889 PyUnicode_GET_LENGTH(string), keepends);
9890 break;
9891 default:
9892 assert(0);
9893 list = 0;
9894 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009895 Py_DECREF(string);
9896 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009897}
9898
Alexander Belopolsky40018472011-02-26 01:02:56 +00009899static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009900split(PyObject *self,
9901 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009902 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009903{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009904 int kind1, kind2, kind;
9905 void *buf1, *buf2;
9906 Py_ssize_t len1, len2;
9907 PyObject* out;
9908
Guido van Rossumd57fd912000-03-10 22:53:23 +00009909 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009910 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009911
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009912 if (PyUnicode_READY(self) == -1)
9913 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009914
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009915 if (substring == NULL)
9916 switch(PyUnicode_KIND(self)) {
9917 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009918 if (PyUnicode_IS_ASCII(self))
9919 return asciilib_split_whitespace(
9920 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9921 PyUnicode_GET_LENGTH(self), maxcount
9922 );
9923 else
9924 return ucs1lib_split_whitespace(
9925 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9926 PyUnicode_GET_LENGTH(self), maxcount
9927 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009928 case PyUnicode_2BYTE_KIND:
9929 return ucs2lib_split_whitespace(
9930 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9931 PyUnicode_GET_LENGTH(self), maxcount
9932 );
9933 case PyUnicode_4BYTE_KIND:
9934 return ucs4lib_split_whitespace(
9935 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9936 PyUnicode_GET_LENGTH(self), maxcount
9937 );
9938 default:
9939 assert(0);
9940 return NULL;
9941 }
9942
9943 if (PyUnicode_READY(substring) == -1)
9944 return NULL;
9945
9946 kind1 = PyUnicode_KIND(self);
9947 kind2 = PyUnicode_KIND(substring);
9948 kind = kind1 > kind2 ? kind1 : kind2;
9949 buf1 = PyUnicode_DATA(self);
9950 buf2 = PyUnicode_DATA(substring);
9951 if (kind1 != kind)
9952 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9953 if (!buf1)
9954 return NULL;
9955 if (kind2 != kind)
9956 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9957 if (!buf2) {
9958 if (kind1 != kind) PyMem_Free(buf1);
9959 return NULL;
9960 }
9961 len1 = PyUnicode_GET_LENGTH(self);
9962 len2 = PyUnicode_GET_LENGTH(substring);
9963
9964 switch(kind) {
9965 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009966 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9967 out = asciilib_split(
9968 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9969 else
9970 out = ucs1lib_split(
9971 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009972 break;
9973 case PyUnicode_2BYTE_KIND:
9974 out = ucs2lib_split(
9975 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9976 break;
9977 case PyUnicode_4BYTE_KIND:
9978 out = ucs4lib_split(
9979 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9980 break;
9981 default:
9982 out = NULL;
9983 }
9984 if (kind1 != kind)
9985 PyMem_Free(buf1);
9986 if (kind2 != kind)
9987 PyMem_Free(buf2);
9988 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009989}
9990
Alexander Belopolsky40018472011-02-26 01:02:56 +00009991static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009992rsplit(PyObject *self,
9993 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009994 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009995{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009996 int kind1, kind2, kind;
9997 void *buf1, *buf2;
9998 Py_ssize_t len1, len2;
9999 PyObject* out;
10000
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010001 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010002 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010003
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010004 if (PyUnicode_READY(self) == -1)
10005 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010006
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010007 if (substring == NULL)
10008 switch(PyUnicode_KIND(self)) {
10009 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010010 if (PyUnicode_IS_ASCII(self))
10011 return asciilib_rsplit_whitespace(
10012 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
10013 PyUnicode_GET_LENGTH(self), maxcount
10014 );
10015 else
10016 return ucs1lib_rsplit_whitespace(
10017 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
10018 PyUnicode_GET_LENGTH(self), maxcount
10019 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010020 case PyUnicode_2BYTE_KIND:
10021 return ucs2lib_rsplit_whitespace(
10022 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
10023 PyUnicode_GET_LENGTH(self), maxcount
10024 );
10025 case PyUnicode_4BYTE_KIND:
10026 return ucs4lib_rsplit_whitespace(
10027 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
10028 PyUnicode_GET_LENGTH(self), maxcount
10029 );
10030 default:
10031 assert(0);
10032 return NULL;
10033 }
10034
10035 if (PyUnicode_READY(substring) == -1)
10036 return NULL;
10037
10038 kind1 = PyUnicode_KIND(self);
10039 kind2 = PyUnicode_KIND(substring);
10040 kind = kind1 > kind2 ? kind1 : kind2;
10041 buf1 = PyUnicode_DATA(self);
10042 buf2 = PyUnicode_DATA(substring);
10043 if (kind1 != kind)
10044 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
10045 if (!buf1)
10046 return NULL;
10047 if (kind2 != kind)
10048 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
10049 if (!buf2) {
10050 if (kind1 != kind) PyMem_Free(buf1);
10051 return NULL;
10052 }
10053 len1 = PyUnicode_GET_LENGTH(self);
10054 len2 = PyUnicode_GET_LENGTH(substring);
10055
10056 switch(kind) {
10057 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010058 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10059 out = asciilib_rsplit(
10060 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
10061 else
10062 out = ucs1lib_rsplit(
10063 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010064 break;
10065 case PyUnicode_2BYTE_KIND:
10066 out = ucs2lib_rsplit(
10067 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
10068 break;
10069 case PyUnicode_4BYTE_KIND:
10070 out = ucs4lib_rsplit(
10071 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
10072 break;
10073 default:
10074 out = NULL;
10075 }
10076 if (kind1 != kind)
10077 PyMem_Free(buf1);
10078 if (kind2 != kind)
10079 PyMem_Free(buf2);
10080 return out;
10081}
10082
10083static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010084anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10085 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010086{
10087 switch(kind) {
10088 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010089 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10090 return asciilib_find(buf1, len1, buf2, len2, offset);
10091 else
10092 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010093 case PyUnicode_2BYTE_KIND:
10094 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10095 case PyUnicode_4BYTE_KIND:
10096 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10097 }
10098 assert(0);
10099 return -1;
10100}
10101
10102static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010103anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10104 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010105{
10106 switch(kind) {
10107 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010108 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10109 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10110 else
10111 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010112 case PyUnicode_2BYTE_KIND:
10113 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10114 case PyUnicode_4BYTE_KIND:
10115 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10116 }
10117 assert(0);
10118 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010119}
10120
Alexander Belopolsky40018472011-02-26 01:02:56 +000010121static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010122replace(PyObject *self, PyObject *str1,
10123 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010124{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010125 PyObject *u;
10126 char *sbuf = PyUnicode_DATA(self);
10127 char *buf1 = PyUnicode_DATA(str1);
10128 char *buf2 = PyUnicode_DATA(str2);
10129 int srelease = 0, release1 = 0, release2 = 0;
10130 int skind = PyUnicode_KIND(self);
10131 int kind1 = PyUnicode_KIND(str1);
10132 int kind2 = PyUnicode_KIND(str2);
10133 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10134 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10135 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010136 int mayshrink;
10137 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010138
10139 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010140 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010141 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010142 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010143
Victor Stinner59de0ee2011-10-07 10:01:28 +020010144 if (str1 == str2)
10145 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010146 if (skind < kind1)
10147 /* substring too wide to be present */
10148 goto nothing;
10149
Victor Stinner49a0a212011-10-12 23:46:10 +020010150 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10151 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10152 /* Replacing str1 with str2 may cause a maxchar reduction in the
10153 result string. */
10154 mayshrink = (maxchar_str2 < maxchar);
10155 maxchar = Py_MAX(maxchar, maxchar_str2);
10156
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010157 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +000010158 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010159 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010160 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010161 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010162 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010163 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010164 Py_UCS4 u1, u2;
10165 int rkind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010166 u1 = PyUnicode_READ_CHAR(str1, 0);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +020010167 if (findchar(sbuf, PyUnicode_KIND(self),
10168 slen, u1, 1) < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010169 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010170 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010171 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010172 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010173 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010174 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010175 rkind = PyUnicode_KIND(u);
10176 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
10177 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010178 if (--maxcount < 0)
10179 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010180 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010181 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010182 }
10183 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010184 int rkind = skind;
10185 char *res;
Victor Stinner25a4b292011-10-06 12:31:55 +020010186
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010187 if (kind1 < rkind) {
10188 /* widen substring */
10189 buf1 = _PyUnicode_AsKind(str1, rkind);
10190 if (!buf1) goto error;
10191 release1 = 1;
10192 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010193 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010194 if (i < 0)
10195 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010196 if (rkind > kind2) {
10197 /* widen replacement */
10198 buf2 = _PyUnicode_AsKind(str2, rkind);
10199 if (!buf2) goto error;
10200 release2 = 1;
10201 }
10202 else if (rkind < kind2) {
10203 /* widen self and buf1 */
10204 rkind = kind2;
10205 if (release1) PyMem_Free(buf1);
10206 sbuf = _PyUnicode_AsKind(self, rkind);
10207 if (!sbuf) goto error;
10208 srelease = 1;
10209 buf1 = _PyUnicode_AsKind(str1, rkind);
10210 if (!buf1) goto error;
10211 release1 = 1;
10212 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010213 u = PyUnicode_New(slen, maxchar);
10214 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010215 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010216 assert(PyUnicode_KIND(u) == rkind);
10217 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010218
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010219 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010220 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010221 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010222 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010223 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010224 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010225
10226 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010227 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010228 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010229 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010230 if (i == -1)
10231 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010232 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010233 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010234 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010235 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010236 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010237 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010238 }
10239 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010240 Py_ssize_t n, i, j, ires;
10241 Py_ssize_t product, new_size;
10242 int rkind = skind;
10243 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010244
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010245 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010246 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010247 buf1 = _PyUnicode_AsKind(str1, rkind);
10248 if (!buf1) goto error;
10249 release1 = 1;
10250 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010251 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010252 if (n == 0)
10253 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010254 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010255 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010256 buf2 = _PyUnicode_AsKind(str2, rkind);
10257 if (!buf2) goto error;
10258 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010259 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010260 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010261 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010262 rkind = kind2;
10263 sbuf = _PyUnicode_AsKind(self, rkind);
10264 if (!sbuf) goto error;
10265 srelease = 1;
10266 if (release1) PyMem_Free(buf1);
10267 buf1 = _PyUnicode_AsKind(str1, rkind);
10268 if (!buf1) goto error;
10269 release1 = 1;
10270 }
10271 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10272 PyUnicode_GET_LENGTH(str1))); */
10273 product = n * (len2-len1);
10274 if ((product / (len2-len1)) != n) {
10275 PyErr_SetString(PyExc_OverflowError,
10276 "replace string is too long");
10277 goto error;
10278 }
10279 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +020010280 if (new_size == 0) {
10281 Py_INCREF(unicode_empty);
10282 u = unicode_empty;
10283 goto done;
10284 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010285 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10286 PyErr_SetString(PyExc_OverflowError,
10287 "replace string is too long");
10288 goto error;
10289 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010290 u = PyUnicode_New(new_size, maxchar);
10291 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010292 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010293 assert(PyUnicode_KIND(u) == rkind);
10294 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010295 ires = i = 0;
10296 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010297 while (n-- > 0) {
10298 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010299 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010300 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010301 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010302 if (j == -1)
10303 break;
10304 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010305 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010306 memcpy(res + rkind * ires,
10307 sbuf + rkind * i,
10308 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010309 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010310 }
10311 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010312 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010313 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010314 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010315 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010316 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010317 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010318 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010319 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010320 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010321 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010322 memcpy(res + rkind * ires,
10323 sbuf + rkind * i,
10324 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010325 }
10326 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010327 /* interleave */
10328 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010329 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010330 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010331 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010332 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010333 if (--n <= 0)
10334 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010335 memcpy(res + rkind * ires,
10336 sbuf + rkind * i,
10337 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010338 ires++;
10339 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010340 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010341 memcpy(res + rkind * ires,
10342 sbuf + rkind * i,
10343 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010344 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010345 }
10346
10347 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010348 unicode_adjust_maxchar(&u);
10349 if (u == NULL)
10350 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010351 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010352
10353 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010354 if (srelease)
10355 PyMem_FREE(sbuf);
10356 if (release1)
10357 PyMem_FREE(buf1);
10358 if (release2)
10359 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010360 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010361 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010362
Benjamin Peterson29060642009-01-31 22:14:21 +000010363 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010364 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010365 if (srelease)
10366 PyMem_FREE(sbuf);
10367 if (release1)
10368 PyMem_FREE(buf1);
10369 if (release2)
10370 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010371 if (PyUnicode_CheckExact(self)) {
10372 Py_INCREF(self);
10373 return (PyObject *) self;
10374 }
Victor Stinner034f6cf2011-09-30 02:26:44 +020010375 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010376 error:
10377 if (srelease && sbuf)
10378 PyMem_FREE(sbuf);
10379 if (release1 && buf1)
10380 PyMem_FREE(buf1);
10381 if (release2 && buf2)
10382 PyMem_FREE(buf2);
10383 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010384}
10385
10386/* --- Unicode Object Methods --------------------------------------------- */
10387
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010388PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010389 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010390\n\
10391Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010392characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010393
10394static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010395unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010396{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010397 return fixup(self, fixtitle);
10398}
10399
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010400PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010401 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010402\n\
10403Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010404have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010405
10406static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010407unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010408{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010409 return fixup(self, fixcapitalize);
10410}
10411
10412#if 0
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010413PyDoc_STRVAR(capwords__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010414 "S.capwords() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010415\n\
10416Apply .capitalize() to all words in S and return the result with\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010417normalized whitespace (all whitespace strings are replaced by ' ').");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010418
10419static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010420unicode_capwords(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010421{
10422 PyObject *list;
10423 PyObject *item;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010424 Py_ssize_t i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010425
Guido van Rossumd57fd912000-03-10 22:53:23 +000010426 /* Split into words */
10427 list = split(self, NULL, -1);
10428 if (!list)
10429 return NULL;
10430
10431 /* Capitalize each word */
10432 for (i = 0; i < PyList_GET_SIZE(list); i++) {
10433 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
Benjamin Peterson29060642009-01-31 22:14:21 +000010434 fixcapitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010435 if (item == NULL)
10436 goto onError;
10437 Py_DECREF(PyList_GET_ITEM(list, i));
10438 PyList_SET_ITEM(list, i, item);
10439 }
10440
10441 /* Join the words to form a new string */
10442 item = PyUnicode_Join(NULL, list);
10443
Benjamin Peterson29060642009-01-31 22:14:21 +000010444 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010445 Py_DECREF(list);
10446 return (PyObject *)item;
10447}
10448#endif
10449
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010450/* Argument converter. Coerces to a single unicode character */
10451
10452static int
10453convert_uc(PyObject *obj, void *addr)
10454{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010455 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010456 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010457
Benjamin Peterson14339b62009-01-31 16:36:08 +000010458 uniobj = PyUnicode_FromObject(obj);
10459 if (uniobj == NULL) {
10460 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010461 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010462 return 0;
10463 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010464 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010465 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010466 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010467 Py_DECREF(uniobj);
10468 return 0;
10469 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010470 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010471 Py_DECREF(uniobj);
10472 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010473}
10474
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010475PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010476 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010477\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010478Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010479done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010480
10481static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010482unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010483{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010484 Py_ssize_t marg, left;
10485 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010486 Py_UCS4 fillchar = ' ';
10487
Victor Stinnere9a29352011-10-01 02:14:59 +020010488 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010489 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010490
Victor Stinnere9a29352011-10-01 02:14:59 +020010491 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010492 return NULL;
10493
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010494 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010495 Py_INCREF(self);
10496 return (PyObject*) self;
10497 }
10498
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010499 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010500 left = marg / 2 + (marg & width & 1);
10501
Victor Stinner9310abb2011-10-05 00:59:23 +020010502 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010503}
10504
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010505/* This function assumes that str1 and str2 are readied by the caller. */
10506
Marc-André Lemburge5034372000-08-08 08:04:29 +000010507static int
10508unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
10509{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010510 int kind1, kind2;
10511 void *data1, *data2;
10512 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010513
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010514 kind1 = PyUnicode_KIND(str1);
10515 kind2 = PyUnicode_KIND(str2);
10516 data1 = PyUnicode_DATA(str1);
10517 data2 = PyUnicode_DATA(str2);
10518 len1 = PyUnicode_GET_LENGTH(str1);
10519 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010520
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010521 for (i = 0; i < len1 && i < len2; ++i) {
10522 Py_UCS4 c1, c2;
10523 c1 = PyUnicode_READ(kind1, data1, i);
10524 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010525
10526 if (c1 != c2)
10527 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010528 }
10529
10530 return (len1 < len2) ? -1 : (len1 != len2);
10531}
10532
Alexander Belopolsky40018472011-02-26 01:02:56 +000010533int
10534PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010535{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010536 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10537 if (PyUnicode_READY(left) == -1 ||
10538 PyUnicode_READY(right) == -1)
10539 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010540 return unicode_compare((PyUnicodeObject *)left,
10541 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010542 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010543 PyErr_Format(PyExc_TypeError,
10544 "Can't compare %.100s and %.100s",
10545 left->ob_type->tp_name,
10546 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010547 return -1;
10548}
10549
Martin v. Löwis5b222132007-06-10 09:51:05 +000010550int
10551PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10552{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010553 Py_ssize_t i;
10554 int kind;
10555 void *data;
10556 Py_UCS4 chr;
10557
Victor Stinner910337b2011-10-03 03:20:16 +020010558 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010559 if (PyUnicode_READY(uni) == -1)
10560 return -1;
10561 kind = PyUnicode_KIND(uni);
10562 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010563 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010564 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10565 if (chr != str[i])
10566 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010567 /* This check keeps Python strings that end in '\0' from comparing equal
10568 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010569 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010570 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010571 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010572 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010573 return 0;
10574}
10575
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010576
Benjamin Peterson29060642009-01-31 22:14:21 +000010577#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010578 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010579
Alexander Belopolsky40018472011-02-26 01:02:56 +000010580PyObject *
10581PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010582{
10583 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010584
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010585 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10586 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010587 if (PyUnicode_READY(left) == -1 ||
10588 PyUnicode_READY(right) == -1)
10589 return NULL;
10590 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10591 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010592 if (op == Py_EQ) {
10593 Py_INCREF(Py_False);
10594 return Py_False;
10595 }
10596 if (op == Py_NE) {
10597 Py_INCREF(Py_True);
10598 return Py_True;
10599 }
10600 }
10601 if (left == right)
10602 result = 0;
10603 else
10604 result = unicode_compare((PyUnicodeObject *)left,
10605 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010606
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010607 /* Convert the return value to a Boolean */
10608 switch (op) {
10609 case Py_EQ:
10610 v = TEST_COND(result == 0);
10611 break;
10612 case Py_NE:
10613 v = TEST_COND(result != 0);
10614 break;
10615 case Py_LE:
10616 v = TEST_COND(result <= 0);
10617 break;
10618 case Py_GE:
10619 v = TEST_COND(result >= 0);
10620 break;
10621 case Py_LT:
10622 v = TEST_COND(result == -1);
10623 break;
10624 case Py_GT:
10625 v = TEST_COND(result == 1);
10626 break;
10627 default:
10628 PyErr_BadArgument();
10629 return NULL;
10630 }
10631 Py_INCREF(v);
10632 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010633 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010634
Brian Curtindfc80e32011-08-10 20:28:54 -050010635 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010636}
10637
Alexander Belopolsky40018472011-02-26 01:02:56 +000010638int
10639PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010640{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010641 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010642 int kind1, kind2, kind;
10643 void *buf1, *buf2;
10644 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010645 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010646
10647 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010648 sub = PyUnicode_FromObject(element);
10649 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010650 PyErr_Format(PyExc_TypeError,
10651 "'in <string>' requires string as left operand, not %s",
10652 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010653 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010654 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010655 if (PyUnicode_READY(sub) == -1)
10656 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010657
Thomas Wouters477c8d52006-05-27 19:21:47 +000010658 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +020010659 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010660 Py_DECREF(sub);
10661 return -1;
10662 }
10663
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010664 kind1 = PyUnicode_KIND(str);
10665 kind2 = PyUnicode_KIND(sub);
10666 kind = kind1 > kind2 ? kind1 : kind2;
10667 buf1 = PyUnicode_DATA(str);
10668 buf2 = PyUnicode_DATA(sub);
10669 if (kind1 != kind)
10670 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
10671 if (!buf1) {
10672 Py_DECREF(sub);
10673 return -1;
10674 }
10675 if (kind2 != kind)
10676 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
10677 if (!buf2) {
10678 Py_DECREF(sub);
10679 if (kind1 != kind) PyMem_Free(buf1);
10680 return -1;
10681 }
10682 len1 = PyUnicode_GET_LENGTH(str);
10683 len2 = PyUnicode_GET_LENGTH(sub);
10684
10685 switch(kind) {
10686 case PyUnicode_1BYTE_KIND:
10687 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10688 break;
10689 case PyUnicode_2BYTE_KIND:
10690 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10691 break;
10692 case PyUnicode_4BYTE_KIND:
10693 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10694 break;
10695 default:
10696 result = -1;
10697 assert(0);
10698 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010699
10700 Py_DECREF(str);
10701 Py_DECREF(sub);
10702
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010703 if (kind1 != kind)
10704 PyMem_Free(buf1);
10705 if (kind2 != kind)
10706 PyMem_Free(buf2);
10707
Guido van Rossum403d68b2000-03-13 15:55:09 +000010708 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010709}
10710
Guido van Rossumd57fd912000-03-10 22:53:23 +000010711/* Concat to string or Unicode object giving a new Unicode object. */
10712
Alexander Belopolsky40018472011-02-26 01:02:56 +000010713PyObject *
10714PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010715{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010716 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010717 Py_UCS4 maxchar, maxchar2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010718
10719 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010720 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010721 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010722 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010723 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010724 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010725 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010726
10727 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010728 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010729 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010730 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010731 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010732 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010733 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010734 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010735 }
10736
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010737 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010738 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
10739 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010740
Guido van Rossumd57fd912000-03-10 22:53:23 +000010741 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010742 w = PyUnicode_New(
10743 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
10744 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010745 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010746 goto onError;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010747 copy_characters(w, 0, u, 0, PyUnicode_GET_LENGTH(u));
10748 copy_characters(w, PyUnicode_GET_LENGTH(u), v, 0, PyUnicode_GET_LENGTH(v));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010749 Py_DECREF(u);
10750 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010751 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010752 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010753
Benjamin Peterson29060642009-01-31 22:14:21 +000010754 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010755 Py_XDECREF(u);
10756 Py_XDECREF(v);
10757 return NULL;
10758}
10759
Victor Stinnerb0923652011-10-04 01:17:31 +020010760static void
10761unicode_append_inplace(PyObject **p_left, PyObject *right)
10762{
10763 Py_ssize_t left_len, right_len, new_len;
Victor Stinnerb0923652011-10-04 01:17:31 +020010764
10765 assert(PyUnicode_IS_READY(*p_left));
10766 assert(PyUnicode_IS_READY(right));
10767
10768 left_len = PyUnicode_GET_LENGTH(*p_left);
10769 right_len = PyUnicode_GET_LENGTH(right);
10770 if (left_len > PY_SSIZE_T_MAX - right_len) {
10771 PyErr_SetString(PyExc_OverflowError,
10772 "strings are too large to concat");
10773 goto error;
10774 }
10775 new_len = left_len + right_len;
10776
10777 /* Now we own the last reference to 'left', so we can resize it
10778 * in-place.
10779 */
10780 if (unicode_resize(p_left, new_len) != 0) {
10781 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10782 * deallocated so it cannot be put back into
10783 * 'variable'. The MemoryError is raised when there
10784 * is no value in 'variable', which might (very
10785 * remotely) be a cause of incompatibilities.
10786 */
10787 goto error;
10788 }
10789 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010790 copy_characters(*p_left, left_len, right, 0, right_len);
10791 _PyUnicode_DIRTY(*p_left);
Victor Stinnerb0923652011-10-04 01:17:31 +020010792 return;
10793
10794error:
10795 Py_DECREF(*p_left);
10796 *p_left = NULL;
10797}
10798
Walter Dörwald1ab83302007-05-18 17:15:44 +000010799void
Victor Stinner23e56682011-10-03 03:54:37 +020010800PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010801{
Victor Stinner23e56682011-10-03 03:54:37 +020010802 PyObject *left, *res;
10803
10804 if (p_left == NULL) {
10805 if (!PyErr_Occurred())
10806 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010807 return;
10808 }
Victor Stinner23e56682011-10-03 03:54:37 +020010809 left = *p_left;
10810 if (right == NULL || !PyUnicode_Check(left)) {
10811 if (!PyErr_Occurred())
10812 PyErr_BadInternalCall();
10813 goto error;
10814 }
10815
Victor Stinnere1335c72011-10-04 20:53:03 +020010816 if (PyUnicode_READY(left))
10817 goto error;
10818 if (PyUnicode_READY(right))
10819 goto error;
10820
Victor Stinner23e56682011-10-03 03:54:37 +020010821 if (PyUnicode_CheckExact(left) && left != unicode_empty
10822 && PyUnicode_CheckExact(right) && right != unicode_empty
10823 && unicode_resizable(left)
10824 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
10825 || _PyUnicode_WSTR(left) != NULL))
10826 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010827 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10828 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010829 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010830 not so different than duplicating the string. */
10831 if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
Victor Stinner23e56682011-10-03 03:54:37 +020010832 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010833 unicode_append_inplace(p_left, right);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010834 if (p_left != NULL)
10835 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010836 return;
10837 }
10838 }
10839
10840 res = PyUnicode_Concat(left, right);
10841 if (res == NULL)
10842 goto error;
10843 Py_DECREF(left);
10844 *p_left = res;
10845 return;
10846
10847error:
10848 Py_DECREF(*p_left);
10849 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +000010850}
10851
10852void
10853PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10854{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010855 PyUnicode_Append(pleft, right);
10856 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010857}
10858
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010859PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010860 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010861\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010862Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010863string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010864interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010865
10866static PyObject *
10867unicode_count(PyUnicodeObject *self, PyObject *args)
10868{
10869 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010870 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010871 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010872 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010873 int kind1, kind2, kind;
10874 void *buf1, *buf2;
10875 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010876
Jesus Ceaac451502011-04-20 17:09:23 +020010877 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10878 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010879 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010880
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010881 kind1 = PyUnicode_KIND(self);
10882 kind2 = PyUnicode_KIND(substring);
10883 kind = kind1 > kind2 ? kind1 : kind2;
10884 buf1 = PyUnicode_DATA(self);
10885 buf2 = PyUnicode_DATA(substring);
10886 if (kind1 != kind)
10887 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
10888 if (!buf1) {
10889 Py_DECREF(substring);
10890 return NULL;
10891 }
10892 if (kind2 != kind)
10893 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
10894 if (!buf2) {
10895 Py_DECREF(substring);
10896 if (kind1 != kind) PyMem_Free(buf1);
10897 return NULL;
10898 }
10899 len1 = PyUnicode_GET_LENGTH(self);
10900 len2 = PyUnicode_GET_LENGTH(substring);
10901
10902 ADJUST_INDICES(start, end, len1);
10903 switch(kind) {
10904 case PyUnicode_1BYTE_KIND:
10905 iresult = ucs1lib_count(
10906 ((Py_UCS1*)buf1) + start, end - start,
10907 buf2, len2, PY_SSIZE_T_MAX
10908 );
10909 break;
10910 case PyUnicode_2BYTE_KIND:
10911 iresult = ucs2lib_count(
10912 ((Py_UCS2*)buf1) + start, end - start,
10913 buf2, len2, PY_SSIZE_T_MAX
10914 );
10915 break;
10916 case PyUnicode_4BYTE_KIND:
10917 iresult = ucs4lib_count(
10918 ((Py_UCS4*)buf1) + start, end - start,
10919 buf2, len2, PY_SSIZE_T_MAX
10920 );
10921 break;
10922 default:
10923 assert(0); iresult = 0;
10924 }
10925
10926 result = PyLong_FromSsize_t(iresult);
10927
10928 if (kind1 != kind)
10929 PyMem_Free(buf1);
10930 if (kind2 != kind)
10931 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010932
10933 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010934
Guido van Rossumd57fd912000-03-10 22:53:23 +000010935 return result;
10936}
10937
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010938PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010939 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010940\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010941Encode S using the codec registered for encoding. Default encoding\n\
10942is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010943handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010944a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10945'xmlcharrefreplace' as well as any other name registered with\n\
10946codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010947
10948static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +000010949unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010950{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010951 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010952 char *encoding = NULL;
10953 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010954
Benjamin Peterson308d6372009-09-18 21:42:35 +000010955 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10956 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010957 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +000010958 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010959}
10960
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010961PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010962 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010963\n\
10964Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010965If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010966
10967static PyObject*
10968unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
10969{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010970 Py_ssize_t i, j, line_pos, src_len, incr;
10971 Py_UCS4 ch;
10972 PyObject *u;
10973 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010974 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010975 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010976 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010977
10978 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010979 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010980
Antoine Pitrou22425222011-10-04 19:10:51 +020010981 if (PyUnicode_READY(self) == -1)
10982 return NULL;
10983
Thomas Wouters7e474022000-07-16 12:04:32 +000010984 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010985 src_len = PyUnicode_GET_LENGTH(self);
10986 i = j = line_pos = 0;
10987 kind = PyUnicode_KIND(self);
10988 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010989 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010990 for (; i < src_len; i++) {
10991 ch = PyUnicode_READ(kind, src_data, i);
10992 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010993 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010994 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010995 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010996 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010997 goto overflow;
10998 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010999 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011000 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011001 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011002 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011003 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011004 goto overflow;
11005 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011006 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011007 if (ch == '\n' || ch == '\r')
11008 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011009 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011010 }
Antoine Pitroue19aa382011-10-04 16:04:01 +020011011 if (!found && PyUnicode_CheckExact(self)) {
11012 Py_INCREF((PyObject *) self);
11013 return (PyObject *) self;
11014 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011015
Guido van Rossumd57fd912000-03-10 22:53:23 +000011016 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011017 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011018 if (!u)
11019 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011020 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011021
Antoine Pitroue71d5742011-10-04 15:55:09 +020011022 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011023
Antoine Pitroue71d5742011-10-04 15:55:09 +020011024 for (; i < src_len; i++) {
11025 ch = PyUnicode_READ(kind, src_data, i);
11026 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011027 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011028 incr = tabsize - (line_pos % tabsize);
11029 line_pos += incr;
11030 while (incr--) {
11031 PyUnicode_WRITE(kind, dest_data, j, ' ');
11032 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011033 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011034 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011035 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011036 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011037 line_pos++;
11038 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011039 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011040 if (ch == '\n' || ch == '\r')
11041 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011042 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011043 }
11044 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinner17efeed2011-10-04 20:05:46 +020011045#ifndef DONT_MAKE_RESULT_READY
11046 if (_PyUnicode_READY_REPLACE(&u)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011047 Py_DECREF(u);
11048 return NULL;
11049 }
Victor Stinner17efeed2011-10-04 20:05:46 +020011050#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011051 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011052 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011053
Antoine Pitroue71d5742011-10-04 15:55:09 +020011054 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011055 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11056 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011057}
11058
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011059PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011060 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011061\n\
11062Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011063such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011064arguments start and end are interpreted as in slice notation.\n\
11065\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011066Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011067
11068static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011069unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011070{
Jesus Ceaac451502011-04-20 17:09:23 +020011071 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011072 Py_ssize_t start;
11073 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011074 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011075
Jesus Ceaac451502011-04-20 17:09:23 +020011076 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11077 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011078 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011079
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011080 if (PyUnicode_READY(self) == -1)
11081 return NULL;
11082 if (PyUnicode_READY(substring) == -1)
11083 return NULL;
11084
Victor Stinner794d5672011-10-10 03:21:36 +020011085 result = any_find_slice(1,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011086 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011087 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011088
11089 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011090
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011091 if (result == -2)
11092 return NULL;
11093
Christian Heimes217cfd12007-12-02 14:31:20 +000011094 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011095}
11096
11097static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011098unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011099{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011100 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
11101 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011102 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011103 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011104}
11105
Guido van Rossumc2504932007-09-18 19:42:40 +000011106/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011107 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011108static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +000011109unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011110{
Guido van Rossumc2504932007-09-18 19:42:40 +000011111 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010011112 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011113
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011114 if (_PyUnicode_HASH(self) != -1)
11115 return _PyUnicode_HASH(self);
11116 if (PyUnicode_READY(self) == -1)
11117 return -1;
11118 len = PyUnicode_GET_LENGTH(self);
11119
11120 /* The hash function as a macro, gets expanded three times below. */
11121#define HASH(P) \
11122 x = (Py_uhash_t)*P << 7; \
11123 while (--len >= 0) \
11124 x = (1000003*x) ^ (Py_uhash_t)*P++;
11125
11126 switch (PyUnicode_KIND(self)) {
11127 case PyUnicode_1BYTE_KIND: {
11128 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11129 HASH(c);
11130 break;
11131 }
11132 case PyUnicode_2BYTE_KIND: {
11133 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11134 HASH(s);
11135 break;
11136 }
11137 default: {
11138 Py_UCS4 *l;
11139 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11140 "Impossible switch case in unicode_hash");
11141 l = PyUnicode_4BYTE_DATA(self);
11142 HASH(l);
11143 break;
11144 }
11145 }
11146 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
11147
Guido van Rossumc2504932007-09-18 19:42:40 +000011148 if (x == -1)
11149 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011150 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011151 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011152}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011153#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011154
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011155PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011156 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011157\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011158Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011159
11160static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011161unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011162{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011163 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +020011164 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011165 Py_ssize_t start;
11166 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011167
Jesus Ceaac451502011-04-20 17:09:23 +020011168 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11169 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011170 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011171
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011172 if (PyUnicode_READY(self) == -1)
11173 return NULL;
11174 if (PyUnicode_READY(substring) == -1)
11175 return NULL;
11176
Victor Stinner794d5672011-10-10 03:21:36 +020011177 result = any_find_slice(1,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011178 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011179 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011180
11181 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011182
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011183 if (result == -2)
11184 return NULL;
11185
Guido van Rossumd57fd912000-03-10 22:53:23 +000011186 if (result < 0) {
11187 PyErr_SetString(PyExc_ValueError, "substring not found");
11188 return NULL;
11189 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011190
Christian Heimes217cfd12007-12-02 14:31:20 +000011191 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011192}
11193
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011194PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011195 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011196\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011197Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011198at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011199
11200static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011201unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011202{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011203 Py_ssize_t i, length;
11204 int kind;
11205 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011206 int cased;
11207
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011208 if (PyUnicode_READY(self) == -1)
11209 return NULL;
11210 length = PyUnicode_GET_LENGTH(self);
11211 kind = PyUnicode_KIND(self);
11212 data = PyUnicode_DATA(self);
11213
Guido van Rossumd57fd912000-03-10 22:53:23 +000011214 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011215 if (length == 1)
11216 return PyBool_FromLong(
11217 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011218
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011219 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011220 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011221 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011222
Guido van Rossumd57fd912000-03-10 22:53:23 +000011223 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011224 for (i = 0; i < length; i++) {
11225 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011226
Benjamin Peterson29060642009-01-31 22:14:21 +000011227 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11228 return PyBool_FromLong(0);
11229 else if (!cased && Py_UNICODE_ISLOWER(ch))
11230 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011231 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011232 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011233}
11234
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011235PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011236 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011237\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011238Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011239at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011240
11241static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011242unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011243{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011244 Py_ssize_t i, length;
11245 int kind;
11246 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011247 int cased;
11248
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011249 if (PyUnicode_READY(self) == -1)
11250 return NULL;
11251 length = PyUnicode_GET_LENGTH(self);
11252 kind = PyUnicode_KIND(self);
11253 data = PyUnicode_DATA(self);
11254
Guido van Rossumd57fd912000-03-10 22:53:23 +000011255 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011256 if (length == 1)
11257 return PyBool_FromLong(
11258 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011259
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011260 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011261 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011262 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011263
Guido van Rossumd57fd912000-03-10 22:53:23 +000011264 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011265 for (i = 0; i < length; i++) {
11266 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011267
Benjamin Peterson29060642009-01-31 22:14:21 +000011268 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11269 return PyBool_FromLong(0);
11270 else if (!cased && Py_UNICODE_ISUPPER(ch))
11271 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011272 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011273 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011274}
11275
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011276PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011277 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011278\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011279Return True if S is a titlecased string and there is at least one\n\
11280character in S, i.e. upper- and titlecase characters may only\n\
11281follow uncased characters and lowercase characters only cased ones.\n\
11282Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011283
11284static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011285unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011286{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011287 Py_ssize_t i, length;
11288 int kind;
11289 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011290 int cased, previous_is_cased;
11291
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011292 if (PyUnicode_READY(self) == -1)
11293 return NULL;
11294 length = PyUnicode_GET_LENGTH(self);
11295 kind = PyUnicode_KIND(self);
11296 data = PyUnicode_DATA(self);
11297
Guido van Rossumd57fd912000-03-10 22:53:23 +000011298 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011299 if (length == 1) {
11300 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11301 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11302 (Py_UNICODE_ISUPPER(ch) != 0));
11303 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011304
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011305 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011306 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011307 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011308
Guido van Rossumd57fd912000-03-10 22:53:23 +000011309 cased = 0;
11310 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011311 for (i = 0; i < length; i++) {
11312 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011313
Benjamin Peterson29060642009-01-31 22:14:21 +000011314 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11315 if (previous_is_cased)
11316 return PyBool_FromLong(0);
11317 previous_is_cased = 1;
11318 cased = 1;
11319 }
11320 else if (Py_UNICODE_ISLOWER(ch)) {
11321 if (!previous_is_cased)
11322 return PyBool_FromLong(0);
11323 previous_is_cased = 1;
11324 cased = 1;
11325 }
11326 else
11327 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011328 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011329 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011330}
11331
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011332PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011333 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011334\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011335Return True if all characters in S are whitespace\n\
11336and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011337
11338static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011339unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011340{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011341 Py_ssize_t i, length;
11342 int kind;
11343 void *data;
11344
11345 if (PyUnicode_READY(self) == -1)
11346 return NULL;
11347 length = PyUnicode_GET_LENGTH(self);
11348 kind = PyUnicode_KIND(self);
11349 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011350
Guido van Rossumd57fd912000-03-10 22:53:23 +000011351 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011352 if (length == 1)
11353 return PyBool_FromLong(
11354 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011355
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011356 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011357 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011358 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011359
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011360 for (i = 0; i < length; i++) {
11361 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011362 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011363 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011364 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011365 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011366}
11367
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011368PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011369 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011370\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011371Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011372and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011373
11374static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011375unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011376{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011377 Py_ssize_t i, length;
11378 int kind;
11379 void *data;
11380
11381 if (PyUnicode_READY(self) == -1)
11382 return NULL;
11383 length = PyUnicode_GET_LENGTH(self);
11384 kind = PyUnicode_KIND(self);
11385 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011386
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011387 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011388 if (length == 1)
11389 return PyBool_FromLong(
11390 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011391
11392 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011393 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011394 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011395
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011396 for (i = 0; i < length; i++) {
11397 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011398 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011399 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011400 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011401}
11402
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011403PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011404 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011405\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011406Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011407and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011408
11409static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011410unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011411{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011412 int kind;
11413 void *data;
11414 Py_ssize_t len, i;
11415
11416 if (PyUnicode_READY(self) == -1)
11417 return NULL;
11418
11419 kind = PyUnicode_KIND(self);
11420 data = PyUnicode_DATA(self);
11421 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011422
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011423 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011424 if (len == 1) {
11425 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11426 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11427 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011428
11429 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011430 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011431 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011432
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011433 for (i = 0; i < len; i++) {
11434 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011435 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011436 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011437 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011438 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011439}
11440
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011441PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011442 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011443\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011444Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011445False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011446
11447static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011448unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011449{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011450 Py_ssize_t i, length;
11451 int kind;
11452 void *data;
11453
11454 if (PyUnicode_READY(self) == -1)
11455 return NULL;
11456 length = PyUnicode_GET_LENGTH(self);
11457 kind = PyUnicode_KIND(self);
11458 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011459
Guido van Rossumd57fd912000-03-10 22:53:23 +000011460 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011461 if (length == 1)
11462 return PyBool_FromLong(
11463 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011464
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011465 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011466 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011467 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011468
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011469 for (i = 0; i < length; i++) {
11470 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011471 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011472 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011473 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011474}
11475
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011476PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011477 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011478\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011479Return True if all characters in S are digits\n\
11480and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011481
11482static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011483unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011484{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011485 Py_ssize_t i, length;
11486 int kind;
11487 void *data;
11488
11489 if (PyUnicode_READY(self) == -1)
11490 return NULL;
11491 length = PyUnicode_GET_LENGTH(self);
11492 kind = PyUnicode_KIND(self);
11493 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011494
Guido van Rossumd57fd912000-03-10 22:53:23 +000011495 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011496 if (length == 1) {
11497 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11498 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11499 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011500
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011501 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011502 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011503 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011504
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011505 for (i = 0; i < length; i++) {
11506 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011507 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011508 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011509 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011510}
11511
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011512PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011513 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011514\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011515Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011516False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011517
11518static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011519unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011520{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011521 Py_ssize_t i, length;
11522 int kind;
11523 void *data;
11524
11525 if (PyUnicode_READY(self) == -1)
11526 return NULL;
11527 length = PyUnicode_GET_LENGTH(self);
11528 kind = PyUnicode_KIND(self);
11529 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011530
Guido van Rossumd57fd912000-03-10 22:53:23 +000011531 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011532 if (length == 1)
11533 return PyBool_FromLong(
11534 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011535
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011536 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011537 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011538 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011539
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011540 for (i = 0; i < length; i++) {
11541 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011542 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011543 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011544 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011545}
11546
Martin v. Löwis47383402007-08-15 07:32:56 +000011547int
11548PyUnicode_IsIdentifier(PyObject *self)
11549{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011550 int kind;
11551 void *data;
11552 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011553 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011554
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011555 if (PyUnicode_READY(self) == -1) {
11556 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011557 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011558 }
11559
11560 /* Special case for empty strings */
11561 if (PyUnicode_GET_LENGTH(self) == 0)
11562 return 0;
11563 kind = PyUnicode_KIND(self);
11564 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011565
11566 /* PEP 3131 says that the first character must be in
11567 XID_Start and subsequent characters in XID_Continue,
11568 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011569 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011570 letters, digits, underscore). However, given the current
11571 definition of XID_Start and XID_Continue, it is sufficient
11572 to check just for these, except that _ must be allowed
11573 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011574 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011575 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011576 return 0;
11577
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011578 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011579 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011580 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011581 return 1;
11582}
11583
11584PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011585 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011586\n\
11587Return True if S is a valid identifier according\n\
11588to the language definition.");
11589
11590static PyObject*
11591unicode_isidentifier(PyObject *self)
11592{
11593 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11594}
11595
Georg Brandl559e5d72008-06-11 18:37:52 +000011596PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011597 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011598\n\
11599Return True if all characters in S are considered\n\
11600printable in repr() or S is empty, False otherwise.");
11601
11602static PyObject*
11603unicode_isprintable(PyObject *self)
11604{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011605 Py_ssize_t i, length;
11606 int kind;
11607 void *data;
11608
11609 if (PyUnicode_READY(self) == -1)
11610 return NULL;
11611 length = PyUnicode_GET_LENGTH(self);
11612 kind = PyUnicode_KIND(self);
11613 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011614
11615 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011616 if (length == 1)
11617 return PyBool_FromLong(
11618 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011619
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011620 for (i = 0; i < length; i++) {
11621 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011622 Py_RETURN_FALSE;
11623 }
11624 }
11625 Py_RETURN_TRUE;
11626}
11627
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011628PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011629 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011630\n\
11631Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011632iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011633
11634static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011635unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011636{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011637 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011638}
11639
Martin v. Löwis18e16552006-02-15 17:27:45 +000011640static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000011641unicode_length(PyUnicodeObject *self)
11642{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011643 if (PyUnicode_READY(self) == -1)
11644 return -1;
11645 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011646}
11647
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011648PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011649 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011650\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011651Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011652done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011653
11654static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011655unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011656{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011657 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011658 Py_UCS4 fillchar = ' ';
11659
11660 if (PyUnicode_READY(self) == -1)
11661 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011662
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011663 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011664 return NULL;
11665
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011666 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011667 Py_INCREF(self);
11668 return (PyObject*) self;
11669 }
11670
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011671 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011672}
11673
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011674PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011675 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011676\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011677Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011678
11679static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011680unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011681{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011682 return fixup(self, fixlower);
11683}
11684
/* Strip modes.  Each value doubles as an index into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, one per strip mode above. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string with its
   three-character "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
11693
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011694/* externally visible for str.strip(unicode) */
11695PyObject *
11696_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
11697{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011698 void *data;
11699 int kind;
11700 Py_ssize_t i, j, len;
11701 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011702
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011703 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11704 return NULL;
11705
11706 kind = PyUnicode_KIND(self);
11707 data = PyUnicode_DATA(self);
11708 len = PyUnicode_GET_LENGTH(self);
11709 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11710 PyUnicode_DATA(sepobj),
11711 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011712
Benjamin Peterson14339b62009-01-31 16:36:08 +000011713 i = 0;
11714 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011715 while (i < len &&
11716 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011717 i++;
11718 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011719 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011720
Benjamin Peterson14339b62009-01-31 16:36:08 +000011721 j = len;
11722 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011723 do {
11724 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011725 } while (j >= i &&
11726 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011727 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011728 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011729
Victor Stinner12bab6d2011-10-01 01:53:49 +020011730 return PyUnicode_Substring((PyObject*)self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011731}
11732
11733PyObject*
11734PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11735{
11736 unsigned char *data;
11737 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011738 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011739
Victor Stinnerde636f32011-10-01 03:55:54 +020011740 if (PyUnicode_READY(self) == -1)
11741 return NULL;
11742
11743 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11744
Victor Stinner12bab6d2011-10-01 01:53:49 +020011745 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011746 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011747 if (PyUnicode_CheckExact(self)) {
11748 Py_INCREF(self);
11749 return self;
11750 }
11751 else
11752 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011753 }
11754
Victor Stinner12bab6d2011-10-01 01:53:49 +020011755 length = end - start;
11756 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011757 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011758
Victor Stinnerde636f32011-10-01 03:55:54 +020011759 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011760 PyErr_SetString(PyExc_IndexError, "string index out of range");
11761 return NULL;
11762 }
11763
Victor Stinnerb9275c12011-10-05 14:01:42 +020011764 if (PyUnicode_IS_ASCII(self)) {
11765 kind = PyUnicode_KIND(self);
11766 data = PyUnicode_1BYTE_DATA(self);
11767 return unicode_fromascii(data + start, length);
11768 }
11769 else {
11770 kind = PyUnicode_KIND(self);
11771 data = PyUnicode_1BYTE_DATA(self);
11772 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011773 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011774 length);
11775 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011776}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011777
11778static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011779do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011780{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011781 int kind;
11782 void *data;
11783 Py_ssize_t len, i, j;
11784
11785 if (PyUnicode_READY(self) == -1)
11786 return NULL;
11787
11788 kind = PyUnicode_KIND(self);
11789 data = PyUnicode_DATA(self);
11790 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011791
Benjamin Peterson14339b62009-01-31 16:36:08 +000011792 i = 0;
11793 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011794 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011795 i++;
11796 }
11797 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011798
Benjamin Peterson14339b62009-01-31 16:36:08 +000011799 j = len;
11800 if (striptype != LEFTSTRIP) {
11801 do {
11802 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011803 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011804 j++;
11805 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011806
Victor Stinner12bab6d2011-10-01 01:53:49 +020011807 return PyUnicode_Substring((PyObject*)self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011808}
11809
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011810
11811static PyObject *
11812do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
11813{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011814 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011815
Benjamin Peterson14339b62009-01-31 16:36:08 +000011816 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11817 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011818
Benjamin Peterson14339b62009-01-31 16:36:08 +000011819 if (sep != NULL && sep != Py_None) {
11820 if (PyUnicode_Check(sep))
11821 return _PyUnicode_XStrip(self, striptype, sep);
11822 else {
11823 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011824 "%s arg must be None or str",
11825 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011826 return NULL;
11827 }
11828 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011829
Benjamin Peterson14339b62009-01-31 16:36:08 +000011830 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011831}
11832
11833
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011834PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011835 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011836\n\
11837Return a copy of the string S with leading and trailing\n\
11838whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011839If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011840
11841static PyObject *
11842unicode_strip(PyUnicodeObject *self, PyObject *args)
11843{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011844 if (PyTuple_GET_SIZE(args) == 0)
11845 return do_strip(self, BOTHSTRIP); /* Common case */
11846 else
11847 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011848}
11849
11850
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011851PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011852 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011853\n\
11854Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011855If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011856
11857static PyObject *
11858unicode_lstrip(PyUnicodeObject *self, PyObject *args)
11859{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011860 if (PyTuple_GET_SIZE(args) == 0)
11861 return do_strip(self, LEFTSTRIP); /* Common case */
11862 else
11863 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011864}
11865
11866
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011867PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011868 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011869\n\
11870Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011871If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011872
11873static PyObject *
11874unicode_rstrip(PyUnicodeObject *self, PyObject *args)
11875{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011876 if (PyTuple_GET_SIZE(args) == 0)
11877 return do_strip(self, RIGHTSTRIP); /* Common case */
11878 else
11879 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011880}
11881
11882
Guido van Rossumd57fd912000-03-10 22:53:23 +000011883static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000011884unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011885{
11886 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011887 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011888
Georg Brandl222de0f2009-04-12 12:01:50 +000011889 if (len < 1) {
11890 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011891 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011892 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011893
Tim Peters7a29bd52001-09-12 03:03:31 +000011894 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011895 /* no repeat, return original string */
11896 Py_INCREF(str);
11897 return (PyObject*) str;
11898 }
Tim Peters8f422462000-09-09 06:13:41 +000011899
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011900 if (PyUnicode_READY(str) == -1)
11901 return NULL;
11902
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011903 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011904 PyErr_SetString(PyExc_OverflowError,
11905 "repeated string is too long");
11906 return NULL;
11907 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011908 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011909
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011910 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011911 if (!u)
11912 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011913 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011914
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011915 if (PyUnicode_GET_LENGTH(str) == 1) {
11916 const int kind = PyUnicode_KIND(str);
11917 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11918 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011919 if (kind == PyUnicode_1BYTE_KIND)
11920 memset(to, (unsigned char)fill_char, len);
11921 else {
11922 for (n = 0; n < len; ++n)
11923 PyUnicode_WRITE(kind, to, n, fill_char);
11924 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011925 }
11926 else {
11927 /* number of characters copied this far */
11928 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011929 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011930 char *to = (char *) PyUnicode_DATA(u);
11931 Py_MEMCPY(to, PyUnicode_DATA(str),
11932 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011933 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011934 n = (done <= nchars-done) ? done : nchars-done;
11935 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011936 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011937 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011938 }
11939
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011940 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011941 return (PyObject*) u;
11942}
11943
Alexander Belopolsky40018472011-02-26 01:02:56 +000011944PyObject *
11945PyUnicode_Replace(PyObject *obj,
11946 PyObject *subobj,
11947 PyObject *replobj,
11948 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011949{
11950 PyObject *self;
11951 PyObject *str1;
11952 PyObject *str2;
11953 PyObject *result;
11954
11955 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011956 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011957 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011958 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011959 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011960 Py_DECREF(self);
11961 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011962 }
11963 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011964 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011965 Py_DECREF(self);
11966 Py_DECREF(str1);
11967 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011968 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011969 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011970 Py_DECREF(self);
11971 Py_DECREF(str1);
11972 Py_DECREF(str2);
11973 return result;
11974}
11975
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011976PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011977 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011978\n\
11979Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011980old replaced by new. If the optional argument count is\n\
11981given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011982
11983static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011984unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011985{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011986 PyObject *str1;
11987 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011988 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011989 PyObject *result;
11990
Martin v. Löwis18e16552006-02-15 17:27:45 +000011991 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011992 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011993 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011994 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011995 str1 = PyUnicode_FromObject(str1);
11996 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11997 return NULL;
11998 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011999 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012000 Py_DECREF(str1);
12001 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000012002 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012003
12004 result = replace(self, str1, str2, maxcount);
12005
12006 Py_DECREF(str1);
12007 Py_DECREF(str2);
12008 return result;
12009}
12010
Alexander Belopolsky40018472011-02-26 01:02:56 +000012011static PyObject *
12012unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012013{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012014 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012015 Py_ssize_t isize;
12016 Py_ssize_t osize, squote, dquote, i, o;
12017 Py_UCS4 max, quote;
12018 int ikind, okind;
12019 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012020
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012021 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012022 return NULL;
12023
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012024 isize = PyUnicode_GET_LENGTH(unicode);
12025 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012026
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012027 /* Compute length of output, quote characters, and
12028 maximum character */
12029 osize = 2; /* quotes */
12030 max = 127;
12031 squote = dquote = 0;
12032 ikind = PyUnicode_KIND(unicode);
12033 for (i = 0; i < isize; i++) {
12034 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12035 switch (ch) {
12036 case '\'': squote++; osize++; break;
12037 case '"': dquote++; osize++; break;
12038 case '\\': case '\t': case '\r': case '\n':
12039 osize += 2; break;
12040 default:
12041 /* Fast-path ASCII */
12042 if (ch < ' ' || ch == 0x7f)
12043 osize += 4; /* \xHH */
12044 else if (ch < 0x7f)
12045 osize++;
12046 else if (Py_UNICODE_ISPRINTABLE(ch)) {
12047 osize++;
12048 max = ch > max ? ch : max;
12049 }
12050 else if (ch < 0x100)
12051 osize += 4; /* \xHH */
12052 else if (ch < 0x10000)
12053 osize += 6; /* \uHHHH */
12054 else
12055 osize += 10; /* \uHHHHHHHH */
12056 }
12057 }
12058
12059 quote = '\'';
12060 if (squote) {
12061 if (dquote)
12062 /* Both squote and dquote present. Use squote,
12063 and escape them */
12064 osize += squote;
12065 else
12066 quote = '"';
12067 }
12068
12069 repr = PyUnicode_New(osize, max);
12070 if (repr == NULL)
12071 return NULL;
12072 okind = PyUnicode_KIND(repr);
12073 odata = PyUnicode_DATA(repr);
12074
12075 PyUnicode_WRITE(okind, odata, 0, quote);
12076 PyUnicode_WRITE(okind, odata, osize-1, quote);
12077
12078 for (i = 0, o = 1; i < isize; i++) {
12079 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012080
12081 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012082 if ((ch == quote) || (ch == '\\')) {
12083 PyUnicode_WRITE(okind, odata, o++, '\\');
12084 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012085 continue;
12086 }
12087
Benjamin Peterson29060642009-01-31 22:14:21 +000012088 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012089 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012090 PyUnicode_WRITE(okind, odata, o++, '\\');
12091 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012092 }
12093 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012094 PyUnicode_WRITE(okind, odata, o++, '\\');
12095 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012096 }
12097 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012098 PyUnicode_WRITE(okind, odata, o++, '\\');
12099 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000012100 }
12101
12102 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000012103 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012104 PyUnicode_WRITE(okind, odata, o++, '\\');
12105 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012106 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12107 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012108 }
12109
Georg Brandl559e5d72008-06-11 18:37:52 +000012110 /* Copy ASCII characters as-is */
12111 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012112 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012113 }
12114
Benjamin Peterson29060642009-01-31 22:14:21 +000012115 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000012116 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012117 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000012118 (categories Z* and C* except ASCII space)
12119 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012120 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012121 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012122 if (ch <= 0xff) {
12123 PyUnicode_WRITE(okind, odata, o++, '\\');
12124 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012125 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12126 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012127 }
12128 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012129 else if (ch >= 0x10000) {
12130 PyUnicode_WRITE(okind, odata, o++, '\\');
12131 PyUnicode_WRITE(okind, odata, o++, 'U');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012132 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12133 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12134 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12135 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12136 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12137 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12138 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12139 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012140 }
12141 /* Map 16-bit characters to '\uxxxx' */
12142 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012143 PyUnicode_WRITE(okind, odata, o++, '\\');
12144 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020012145 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12146 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12147 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12148 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012149 }
12150 }
12151 /* Copy characters as-is */
12152 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012153 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012154 }
12155 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012156 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012157 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012158 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012159 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012160}
12161
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012162PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012163 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012164\n\
12165Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012166such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012167arguments start and end are interpreted as in slice notation.\n\
12168\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012169Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012170
12171static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012172unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012173{
Jesus Ceaac451502011-04-20 17:09:23 +020012174 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012175 Py_ssize_t start;
12176 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012177 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012178
Jesus Ceaac451502011-04-20 17:09:23 +020012179 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12180 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012181 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012182
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012183 if (PyUnicode_READY(self) == -1)
12184 return NULL;
12185 if (PyUnicode_READY(substring) == -1)
12186 return NULL;
12187
Victor Stinner794d5672011-10-10 03:21:36 +020012188 result = any_find_slice(-1,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012189 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000012190 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000012191
12192 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012193
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012194 if (result == -2)
12195 return NULL;
12196
Christian Heimes217cfd12007-12-02 14:31:20 +000012197 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012198}
12199
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012200PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012201 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012202\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012203Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012204
12205static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012206unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012207{
Jesus Ceaac451502011-04-20 17:09:23 +020012208 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012209 Py_ssize_t start;
12210 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012211 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012212
Jesus Ceaac451502011-04-20 17:09:23 +020012213 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12214 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012215 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012216
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012217 if (PyUnicode_READY(self) == -1)
12218 return NULL;
12219 if (PyUnicode_READY(substring) == -1)
12220 return NULL;
12221
Victor Stinner794d5672011-10-10 03:21:36 +020012222 result = any_find_slice(-1,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012223 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000012224 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000012225
12226 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012227
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012228 if (result == -2)
12229 return NULL;
12230
Guido van Rossumd57fd912000-03-10 22:53:23 +000012231 if (result < 0) {
12232 PyErr_SetString(PyExc_ValueError, "substring not found");
12233 return NULL;
12234 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012235
Christian Heimes217cfd12007-12-02 14:31:20 +000012236 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012237}
12238
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012239PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012240 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012241\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012242Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012243done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012244
12245static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012246unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012247{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012248 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012249 Py_UCS4 fillchar = ' ';
12250
Victor Stinnere9a29352011-10-01 02:14:59 +020012251 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012252 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012253
Victor Stinnere9a29352011-10-01 02:14:59 +020012254 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012255 return NULL;
12256
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012257 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012258 Py_INCREF(self);
12259 return (PyObject*) self;
12260 }
12261
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012262 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012263}
12264
Alexander Belopolsky40018472011-02-26 01:02:56 +000012265PyObject *
12266PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012267{
12268 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012269
Guido van Rossumd57fd912000-03-10 22:53:23 +000012270 s = PyUnicode_FromObject(s);
12271 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012272 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012273 if (sep != NULL) {
12274 sep = PyUnicode_FromObject(sep);
12275 if (sep == NULL) {
12276 Py_DECREF(s);
12277 return NULL;
12278 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012279 }
12280
Victor Stinner9310abb2011-10-05 00:59:23 +020012281 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012282
12283 Py_DECREF(s);
12284 Py_XDECREF(sep);
12285 return result;
12286}
12287
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012288PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012289 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012290\n\
12291Return a list of the words in S, using sep as the\n\
12292delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012293splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012294whitespace string is a separator and empty strings are\n\
12295removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012296
12297static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012298unicode_split(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012299{
12300 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012301 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012302
Martin v. Löwis18e16552006-02-15 17:27:45 +000012303 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012304 return NULL;
12305
12306 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012307 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012308 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012309 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012310 else
Benjamin Peterson29060642009-01-31 22:14:21 +000012311 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012312}
12313
Thomas Wouters477c8d52006-05-27 19:21:47 +000012314PyObject *
12315PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12316{
12317 PyObject* str_obj;
12318 PyObject* sep_obj;
12319 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012320 int kind1, kind2, kind;
12321 void *buf1 = NULL, *buf2 = NULL;
12322 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012323
12324 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020012325 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012326 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012327 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012328 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012329 Py_DECREF(str_obj);
12330 return NULL;
12331 }
12332
Victor Stinner14f8f022011-10-05 20:58:25 +020012333 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012334 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012335 kind = Py_MAX(kind1, kind2);
12336 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012337 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012338 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012339 if (!buf1)
12340 goto onError;
12341 buf2 = PyUnicode_DATA(sep_obj);
12342 if (kind2 != kind)
12343 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12344 if (!buf2)
12345 goto onError;
12346 len1 = PyUnicode_GET_LENGTH(str_obj);
12347 len2 = PyUnicode_GET_LENGTH(sep_obj);
12348
Victor Stinner14f8f022011-10-05 20:58:25 +020012349 switch(PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012350 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012351 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12352 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12353 else
12354 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012355 break;
12356 case PyUnicode_2BYTE_KIND:
12357 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12358 break;
12359 case PyUnicode_4BYTE_KIND:
12360 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12361 break;
12362 default:
12363 assert(0);
12364 out = 0;
12365 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012366
12367 Py_DECREF(sep_obj);
12368 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012369 if (kind1 != kind)
12370 PyMem_Free(buf1);
12371 if (kind2 != kind)
12372 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012373
12374 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012375 onError:
12376 Py_DECREF(sep_obj);
12377 Py_DECREF(str_obj);
12378 if (kind1 != kind && buf1)
12379 PyMem_Free(buf1);
12380 if (kind2 != kind && buf2)
12381 PyMem_Free(buf2);
12382 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012383}
12384
12385
12386PyObject *
12387PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12388{
12389 PyObject* str_obj;
12390 PyObject* sep_obj;
12391 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012392 int kind1, kind2, kind;
12393 void *buf1 = NULL, *buf2 = NULL;
12394 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012395
12396 str_obj = PyUnicode_FromObject(str_in);
12397 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012398 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012399 sep_obj = PyUnicode_FromObject(sep_in);
12400 if (!sep_obj) {
12401 Py_DECREF(str_obj);
12402 return NULL;
12403 }
12404
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012405 kind1 = PyUnicode_KIND(str_in);
12406 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012407 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012408 buf1 = PyUnicode_DATA(str_in);
12409 if (kind1 != kind)
12410 buf1 = _PyUnicode_AsKind(str_in, kind);
12411 if (!buf1)
12412 goto onError;
12413 buf2 = PyUnicode_DATA(sep_obj);
12414 if (kind2 != kind)
12415 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12416 if (!buf2)
12417 goto onError;
12418 len1 = PyUnicode_GET_LENGTH(str_obj);
12419 len2 = PyUnicode_GET_LENGTH(sep_obj);
12420
12421 switch(PyUnicode_KIND(str_in)) {
12422 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012423 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12424 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12425 else
12426 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012427 break;
12428 case PyUnicode_2BYTE_KIND:
12429 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12430 break;
12431 case PyUnicode_4BYTE_KIND:
12432 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12433 break;
12434 default:
12435 assert(0);
12436 out = 0;
12437 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012438
12439 Py_DECREF(sep_obj);
12440 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012441 if (kind1 != kind)
12442 PyMem_Free(buf1);
12443 if (kind2 != kind)
12444 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012445
12446 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012447 onError:
12448 Py_DECREF(sep_obj);
12449 Py_DECREF(str_obj);
12450 if (kind1 != kind && buf1)
12451 PyMem_Free(buf1);
12452 if (kind2 != kind && buf2)
12453 PyMem_Free(buf2);
12454 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012455}
12456
12457PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012458 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012459\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012460Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012461the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012462found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012463
12464static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012465unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012466{
Victor Stinner9310abb2011-10-05 00:59:23 +020012467 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012468}
12469
12470PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012471 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012472\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012473Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012474the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012475separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012476
12477static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012478unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012479{
Victor Stinner9310abb2011-10-05 00:59:23 +020012480 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012481}
12482
Alexander Belopolsky40018472011-02-26 01:02:56 +000012483PyObject *
12484PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012485{
12486 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012487
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012488 s = PyUnicode_FromObject(s);
12489 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012490 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012491 if (sep != NULL) {
12492 sep = PyUnicode_FromObject(sep);
12493 if (sep == NULL) {
12494 Py_DECREF(s);
12495 return NULL;
12496 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012497 }
12498
Victor Stinner9310abb2011-10-05 00:59:23 +020012499 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012500
12501 Py_DECREF(s);
12502 Py_XDECREF(sep);
12503 return result;
12504}
12505
12506PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012507 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012508\n\
12509Return a list of the words in S, using sep as the\n\
12510delimiter string, starting at the end of the string and\n\
12511working to the front. If maxsplit is given, at most maxsplit\n\
12512splits are done. If sep is not specified, any whitespace string\n\
12513is a separator.");
12514
12515static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012516unicode_rsplit(PyObject *self, PyObject *args)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012517{
12518 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012519 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012520
Martin v. Löwis18e16552006-02-15 17:27:45 +000012521 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012522 return NULL;
12523
12524 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012525 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012526 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012527 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012528 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012529 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012530}
12531
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012532PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012533 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012534\n\
12535Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012536Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012537is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012538
12539static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012540unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012541{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012542 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012543 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012544
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012545 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12546 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012547 return NULL;
12548
Guido van Rossum86662912000-04-11 15:38:46 +000012549 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012550}
12551
12552static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012553PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012554{
Walter Dörwald346737f2007-05-31 10:44:43 +000012555 if (PyUnicode_CheckExact(self)) {
12556 Py_INCREF(self);
12557 return self;
12558 } else
12559 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020012560 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012561}
12562
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012563PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012564 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012565\n\
12566Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012567and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012568
12569static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012570unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012571{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012572 return fixup(self, fixswapcase);
12573}
12574
Georg Brandlceee0772007-11-27 23:48:05 +000012575PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012576 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012577\n\
12578Return a translation table usable for str.translate().\n\
12579If there is only one argument, it must be a dictionary mapping Unicode\n\
12580ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012581Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012582If there are two arguments, they must be strings of equal length, and\n\
12583in the resulting dictionary, each character in x will be mapped to the\n\
12584character at the same position in y. If there is a third argument, it\n\
12585must be a string, whose characters will be mapped to None in the result.");
12586
12587static PyObject*
12588unicode_maketrans(PyUnicodeObject *null, PyObject *args)
12589{
12590 PyObject *x, *y = NULL, *z = NULL;
12591 PyObject *new = NULL, *key, *value;
12592 Py_ssize_t i = 0;
12593 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012594
Georg Brandlceee0772007-11-27 23:48:05 +000012595 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12596 return NULL;
12597 new = PyDict_New();
12598 if (!new)
12599 return NULL;
12600 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012601 int x_kind, y_kind, z_kind;
12602 void *x_data, *y_data, *z_data;
12603
Georg Brandlceee0772007-11-27 23:48:05 +000012604 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012605 if (!PyUnicode_Check(x)) {
12606 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12607 "be a string if there is a second argument");
12608 goto err;
12609 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012610 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012611 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12612 "arguments must have equal length");
12613 goto err;
12614 }
12615 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012616 x_kind = PyUnicode_KIND(x);
12617 y_kind = PyUnicode_KIND(y);
12618 x_data = PyUnicode_DATA(x);
12619 y_data = PyUnicode_DATA(y);
12620 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12621 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12622 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012623 if (!key || !value)
12624 goto err;
12625 res = PyDict_SetItem(new, key, value);
12626 Py_DECREF(key);
12627 Py_DECREF(value);
12628 if (res < 0)
12629 goto err;
12630 }
12631 /* create entries for deleting chars in z */
12632 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012633 z_kind = PyUnicode_KIND(z);
12634 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012635 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012636 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012637 if (!key)
12638 goto err;
12639 res = PyDict_SetItem(new, key, Py_None);
12640 Py_DECREF(key);
12641 if (res < 0)
12642 goto err;
12643 }
12644 }
12645 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012646 int kind;
12647 void *data;
12648
Georg Brandlceee0772007-11-27 23:48:05 +000012649 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012650 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012651 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12652 "to maketrans it must be a dict");
12653 goto err;
12654 }
12655 /* copy entries into the new dict, converting string keys to int keys */
12656 while (PyDict_Next(x, &i, &key, &value)) {
12657 if (PyUnicode_Check(key)) {
12658 /* convert string keys to integer keys */
12659 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012660 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012661 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12662 "table must be of length 1");
12663 goto err;
12664 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012665 kind = PyUnicode_KIND(key);
12666 data = PyUnicode_DATA(key);
12667 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012668 if (!newkey)
12669 goto err;
12670 res = PyDict_SetItem(new, newkey, value);
12671 Py_DECREF(newkey);
12672 if (res < 0)
12673 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012674 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012675 /* just keep integer keys */
12676 if (PyDict_SetItem(new, key, value) < 0)
12677 goto err;
12678 } else {
12679 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12680 "be strings or integers");
12681 goto err;
12682 }
12683 }
12684 }
12685 return new;
12686 err:
12687 Py_DECREF(new);
12688 return NULL;
12689}
12690
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012691PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012692 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012693\n\
12694Return a copy of the string S, where all characters have been mapped\n\
12695through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012696Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012697Unmapped characters are left untouched. Characters mapped to None\n\
12698are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012699
12700static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012701unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012702{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012703 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012704}
12705
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012706PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012707 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012708\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012709Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012710
12711static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012712unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012713{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012714 return fixup(self, fixupper);
12715}
12716
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012717PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012718 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012719\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012720Pad a numeric string S with zeros on the left, to fill a field\n\
12721of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012722
12723static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012724unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012725{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012726 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012727 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012728 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012729 int kind;
12730 void *data;
12731 Py_UCS4 chr;
12732
12733 if (PyUnicode_READY(self) == -1)
12734 return NULL;
12735
Martin v. Löwis18e16552006-02-15 17:27:45 +000012736 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012737 return NULL;
12738
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012739 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012740 if (PyUnicode_CheckExact(self)) {
12741 Py_INCREF(self);
12742 return (PyObject*) self;
12743 }
12744 else
Victor Stinner2219e0a2011-10-01 01:16:59 +020012745 return PyUnicode_Copy((PyObject*)self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012746 }
12747
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012748 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012749
12750 u = pad(self, fill, 0, '0');
12751
Walter Dörwald068325e2002-04-15 13:36:47 +000012752 if (u == NULL)
12753 return NULL;
12754
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012755 kind = PyUnicode_KIND(u);
12756 data = PyUnicode_DATA(u);
12757 chr = PyUnicode_READ(kind, data, fill);
12758
12759 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012760 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012761 PyUnicode_WRITE(kind, data, 0, chr);
12762 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012763 }
12764
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012765 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012766 return (PyObject*) u;
12767}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012768
#if 0
/* Compiled out.  Thin wrapper that returns the result of
   PyUnicode_TransformDecimalAndSpaceToASCII() for S. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *ascii = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return ascii;
}
#endif
12776
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012777PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012778 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012779\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012780Return True if S starts with the specified prefix, False otherwise.\n\
12781With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012782With optional end, stop comparing S at that position.\n\
12783prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012784
12785static PyObject *
12786unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012787 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012788{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012789 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012790 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012791 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012792 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012793 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012794
Jesus Ceaac451502011-04-20 17:09:23 +020012795 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012796 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012797 if (PyTuple_Check(subobj)) {
12798 Py_ssize_t i;
12799 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12800 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012801 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012802 if (substring == NULL)
12803 return NULL;
12804 result = tailmatch(self, substring, start, end, -1);
12805 Py_DECREF(substring);
12806 if (result) {
12807 Py_RETURN_TRUE;
12808 }
12809 }
12810 /* nothing matched */
12811 Py_RETURN_FALSE;
12812 }
12813 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012814 if (substring == NULL) {
12815 if (PyErr_ExceptionMatches(PyExc_TypeError))
12816 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12817 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012818 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012819 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012820 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012821 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012822 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012823}
12824
12825
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012826PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012827 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012828\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012829Return True if S ends with the specified suffix, False otherwise.\n\
12830With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012831With optional end, stop comparing S at that position.\n\
12832suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012833
12834static PyObject *
12835unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012836 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012837{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012838 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012839 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012840 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012841 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012842 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012843
Jesus Ceaac451502011-04-20 17:09:23 +020012844 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012845 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012846 if (PyTuple_Check(subobj)) {
12847 Py_ssize_t i;
12848 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12849 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012850 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012851 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012852 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012853 result = tailmatch(self, substring, start, end, +1);
12854 Py_DECREF(substring);
12855 if (result) {
12856 Py_RETURN_TRUE;
12857 }
12858 }
12859 Py_RETURN_FALSE;
12860 }
12861 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012862 if (substring == NULL) {
12863 if (PyErr_ExceptionMatches(PyExc_TypeError))
12864 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12865 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012866 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012867 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012868 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012869 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012870 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012871}
12872
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012873#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012874
12875PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012876 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012877\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012878Return a formatted version of S, using substitutions from args and kwargs.\n\
12879The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012880
Eric Smith27bbca62010-11-04 17:06:58 +000012881PyDoc_STRVAR(format_map__doc__,
12882 "S.format_map(mapping) -> str\n\
12883\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012884Return a formatted version of S, using substitutions from mapping.\n\
12885The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012886
Eric Smith4a7d76d2008-05-30 18:10:19 +000012887static PyObject *
12888unicode__format__(PyObject* self, PyObject* args)
12889{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012890 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012891
12892 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12893 return NULL;
12894
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012895 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012896 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012897 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012898}
12899
Eric Smith8c663262007-08-25 02:26:07 +000012900PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012901 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012902\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012903Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012904
12905static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012906unicode__sizeof__(PyUnicodeObject *v)
12907{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012908 Py_ssize_t size;
12909
12910 /* If it's a compact object, account for base structure +
12911 character data. */
12912 if (PyUnicode_IS_COMPACT_ASCII(v))
12913 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12914 else if (PyUnicode_IS_COMPACT(v))
12915 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012916 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012917 else {
12918 /* If it is a two-block object, account for base object, and
12919 for character block if present. */
12920 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012921 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012922 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012923 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012924 }
12925 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012926 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012927 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012928 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012929 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012930 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012931
12932 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012933}
12934
12935PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012936 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012937
12938static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012939unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012940{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012941 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012942 if (!copy)
12943 return NULL;
12944 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012945}
12946
Guido van Rossumd57fd912000-03-10 22:53:23 +000012947static PyMethodDef unicode_methods[] = {
12948
12949 /* Order is according to common usage: often used methods should
12950 appear first, since lookup is done sequentially. */
12951
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012952 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012953 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12954 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012955 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012956 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12957 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12958 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12959 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12960 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12961 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12962 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012963 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012964 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12965 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12966 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012967 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012968 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12969 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12970 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012971 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012972 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012973 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012974 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012975 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12976 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12977 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12978 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12979 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12980 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12981 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12982 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12983 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12984 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12985 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12986 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12987 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12988 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012989 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012990 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012991 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012992 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012993 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012994 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012995 {"maketrans", (PyCFunction) unicode_maketrans,
12996 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012997 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012998#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012999 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013000#endif
13001
13002#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013003 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013004 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013005#endif
13006
Benjamin Peterson14339b62009-01-31 16:36:08 +000013007 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013008 {NULL, NULL}
13009};
13010
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013011static PyObject *
13012unicode_mod(PyObject *v, PyObject *w)
13013{
Brian Curtindfc80e32011-08-10 20:28:54 -050013014 if (!PyUnicode_Check(v))
13015 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013016 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013017}
13018
13019static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013020 0, /*nb_add*/
13021 0, /*nb_subtract*/
13022 0, /*nb_multiply*/
13023 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013024};
13025
Guido van Rossumd57fd912000-03-10 22:53:23 +000013026static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013027 (lenfunc) unicode_length, /* sq_length */
13028 PyUnicode_Concat, /* sq_concat */
13029 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13030 (ssizeargfunc) unicode_getitem, /* sq_item */
13031 0, /* sq_slice */
13032 0, /* sq_ass_item */
13033 0, /* sq_ass_slice */
13034 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013035};
13036
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013037static PyObject*
13038unicode_subscript(PyUnicodeObject* self, PyObject* item)
13039{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013040 if (PyUnicode_READY(self) == -1)
13041 return NULL;
13042
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013043 if (PyIndex_Check(item)) {
13044 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013045 if (i == -1 && PyErr_Occurred())
13046 return NULL;
13047 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013048 i += PyUnicode_GET_LENGTH(self);
Victor Stinner2fe5ced2011-10-02 00:25:40 +020013049 return unicode_getitem((PyObject*)self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013050 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013051 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013052 PyObject *result;
13053 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013054 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013055 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013056
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013057 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013058 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013059 return NULL;
13060 }
13061
13062 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013063 return PyUnicode_New(0, 0);
13064 } else if (start == 0 && step == 1 &&
13065 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000013066 PyUnicode_CheckExact(self)) {
13067 Py_INCREF(self);
13068 return (PyObject *)self;
13069 } else if (step == 1) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020013070 return PyUnicode_Substring((PyObject*)self,
13071 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013072 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013073 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013074 src_kind = PyUnicode_KIND(self);
13075 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013076 if (!PyUnicode_IS_ASCII(self)) {
13077 kind_limit = kind_maxchar_limit(src_kind);
13078 max_char = 0;
13079 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13080 ch = PyUnicode_READ(src_kind, src_data, cur);
13081 if (ch > max_char) {
13082 max_char = ch;
13083 if (max_char >= kind_limit)
13084 break;
13085 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013086 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013087 }
Victor Stinner55c99112011-10-13 01:17:06 +020013088 else
13089 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013090 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013091 if (result == NULL)
13092 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013093 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013094 dest_data = PyUnicode_DATA(result);
13095
13096 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013097 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13098 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013099 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013100 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013101 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013102 } else {
13103 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13104 return NULL;
13105 }
13106}
13107
13108static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013109 (lenfunc)unicode_length, /* mp_length */
13110 (binaryfunc)unicode_subscript, /* mp_subscript */
13111 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013112};
13113
Guido van Rossumd57fd912000-03-10 22:53:23 +000013114
Guido van Rossumd57fd912000-03-10 22:53:23 +000013115/* Helpers for PyUnicode_Format() */
13116
13117static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000013118getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013119{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013120 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013121 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013122 (*p_argidx)++;
13123 if (arglen < 0)
13124 return args;
13125 else
13126 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013127 }
13128 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013129 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013130 return NULL;
13131}
13132
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013133/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013134
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013135static PyObject *
13136formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013137{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013138 char *p;
13139 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013140 double x;
Tim Petersced69f82003-09-16 20:30:58 +000013141
Guido van Rossumd57fd912000-03-10 22:53:23 +000013142 x = PyFloat_AsDouble(v);
13143 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013144 return NULL;
13145
Guido van Rossumd57fd912000-03-10 22:53:23 +000013146 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013147 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013148
Eric Smith0923d1d2009-04-16 20:16:10 +000013149 p = PyOS_double_to_string(x, type, prec,
13150 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013151 if (p == NULL)
13152 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013153 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000013154 PyMem_Free(p);
13155 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013156}
13157
Tim Peters38fd5b62000-09-21 05:43:11 +000013158static PyObject*
13159formatlong(PyObject *val, int flags, int prec, int type)
13160{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013161 char *buf;
13162 int len;
13163 PyObject *str; /* temporary string object. */
13164 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013165
Benjamin Peterson14339b62009-01-31 16:36:08 +000013166 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
13167 if (!str)
13168 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013169 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013170 Py_DECREF(str);
13171 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013172}
13173
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013174static Py_UCS4
13175formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013176{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013177 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013178 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013179 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013180 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013181 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013182 goto onError;
13183 }
13184 else {
13185 /* Integer input truncated to a character */
13186 long x;
13187 x = PyLong_AsLong(v);
13188 if (x == -1 && PyErr_Occurred())
13189 goto onError;
13190
13191 if (x < 0 || x > 0x10ffff) {
13192 PyErr_SetString(PyExc_OverflowError,
13193 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013194 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013195 }
13196
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013197 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013198 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013199
Benjamin Peterson29060642009-01-31 22:14:21 +000013200 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013201 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013202 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013203 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013204}
13205
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013206static int
13207repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
13208{
13209 int r;
13210 assert(count > 0);
13211 assert(PyUnicode_Check(obj));
13212 if (count > 5) {
13213 PyObject *repeated = unicode_repeat((PyUnicodeObject *) obj, count);
13214 if (repeated == NULL)
13215 return -1;
13216 r = _PyAccu_Accumulate(acc, repeated);
13217 Py_DECREF(repeated);
13218 return r;
13219 }
13220 else {
13221 do {
13222 if (_PyAccu_Accumulate(acc, obj))
13223 return -1;
13224 } while (--count);
13225 return 0;
13226 }
13227}
13228
Alexander Belopolsky40018472011-02-26 01:02:56 +000013229PyObject *
13230PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013231{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013232 void *fmt;
13233 int fmtkind;
13234 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013235 int kind;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013236 int r;
13237 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013238 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013239 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013240 PyObject *temp = NULL;
13241 PyObject *second = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013242 PyUnicodeObject *uformat;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013243 _PyAccu acc;
13244 static PyObject *plus, *minus, *blank, *zero, *percent;
13245
13246 if (!plus && !(plus = get_latin1_char('+')))
13247 return NULL;
13248 if (!minus && !(minus = get_latin1_char('-')))
13249 return NULL;
13250 if (!blank && !(blank = get_latin1_char(' ')))
13251 return NULL;
13252 if (!zero && !(zero = get_latin1_char('0')))
13253 return NULL;
13254 if (!percent && !(percent = get_latin1_char('%')))
13255 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000013256
Guido van Rossumd57fd912000-03-10 22:53:23 +000013257 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013258 PyErr_BadInternalCall();
13259 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013260 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013261 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
13262 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013263 return NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013264 if (_PyAccu_Init(&acc))
13265 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013266 fmt = PyUnicode_DATA(uformat);
13267 fmtkind = PyUnicode_KIND(uformat);
13268 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13269 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013270
Guido van Rossumd57fd912000-03-10 22:53:23 +000013271 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013272 arglen = PyTuple_Size(args);
13273 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013274 }
13275 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013276 arglen = -1;
13277 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013278 }
Christian Heimes90aa7642007-12-19 02:45:37 +000013279 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000013280 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013281 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013282
13283 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013284 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013285 PyObject *nonfmt;
13286 Py_ssize_t nonfmtpos;
13287 nonfmtpos = fmtpos++;
13288 while (fmtcnt >= 0 &&
13289 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13290 fmtpos++;
13291 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013292 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013293 nonfmt = PyUnicode_Substring((PyObject *) uformat, nonfmtpos, fmtpos);
13294 if (nonfmt == NULL)
13295 goto onError;
13296 r = _PyAccu_Accumulate(&acc, nonfmt);
13297 Py_DECREF(nonfmt);
13298 if (r)
13299 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013300 }
13301 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013302 /* Got a format specifier */
13303 int flags = 0;
13304 Py_ssize_t width = -1;
13305 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013306 Py_UCS4 c = '\0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013307 Py_UCS4 fill, sign;
Benjamin Peterson29060642009-01-31 22:14:21 +000013308 int isnumok;
13309 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013310 void *pbuf = NULL;
13311 Py_ssize_t pindex, len;
13312 PyObject *signobj = NULL, *fillobj = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013313
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013314 fmtpos++;
13315 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
13316 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013317 Py_ssize_t keylen;
13318 PyObject *key;
13319 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013320
Benjamin Peterson29060642009-01-31 22:14:21 +000013321 if (dict == NULL) {
13322 PyErr_SetString(PyExc_TypeError,
13323 "format requires a mapping");
13324 goto onError;
13325 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013326 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013327 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013328 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013329 /* Skip over balanced parentheses */
13330 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013331 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013332 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013333 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013334 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013335 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013336 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013337 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013338 if (fmtcnt < 0 || pcount > 0) {
13339 PyErr_SetString(PyExc_ValueError,
13340 "incomplete format key");
13341 goto onError;
13342 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020013343 key = PyUnicode_Substring((PyObject*)uformat,
13344 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013345 if (key == NULL)
13346 goto onError;
13347 if (args_owned) {
13348 Py_DECREF(args);
13349 args_owned = 0;
13350 }
13351 args = PyObject_GetItem(dict, key);
13352 Py_DECREF(key);
13353 if (args == NULL) {
13354 goto onError;
13355 }
13356 args_owned = 1;
13357 arglen = -1;
13358 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013359 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013360 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013361 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013362 case '-': flags |= F_LJUST; continue;
13363 case '+': flags |= F_SIGN; continue;
13364 case ' ': flags |= F_BLANK; continue;
13365 case '#': flags |= F_ALT; continue;
13366 case '0': flags |= F_ZERO; continue;
13367 }
13368 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013369 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013370 if (c == '*') {
13371 v = getnextarg(args, arglen, &argidx);
13372 if (v == NULL)
13373 goto onError;
13374 if (!PyLong_Check(v)) {
13375 PyErr_SetString(PyExc_TypeError,
13376 "* wants int");
13377 goto onError;
13378 }
13379 width = PyLong_AsLong(v);
13380 if (width == -1 && PyErr_Occurred())
13381 goto onError;
13382 if (width < 0) {
13383 flags |= F_LJUST;
13384 width = -width;
13385 }
13386 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013387 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013388 }
13389 else if (c >= '0' && c <= '9') {
13390 width = c - '0';
13391 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013392 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013393 if (c < '0' || c > '9')
13394 break;
13395 if ((width*10) / 10 != width) {
13396 PyErr_SetString(PyExc_ValueError,
13397 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013398 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013399 }
13400 width = width*10 + (c - '0');
13401 }
13402 }
13403 if (c == '.') {
13404 prec = 0;
13405 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013406 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013407 if (c == '*') {
13408 v = getnextarg(args, arglen, &argidx);
13409 if (v == NULL)
13410 goto onError;
13411 if (!PyLong_Check(v)) {
13412 PyErr_SetString(PyExc_TypeError,
13413 "* wants int");
13414 goto onError;
13415 }
13416 prec = PyLong_AsLong(v);
13417 if (prec == -1 && PyErr_Occurred())
13418 goto onError;
13419 if (prec < 0)
13420 prec = 0;
13421 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013422 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013423 }
13424 else if (c >= '0' && c <= '9') {
13425 prec = c - '0';
13426 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013427 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013428 if (c < '0' || c > '9')
13429 break;
13430 if ((prec*10) / 10 != prec) {
13431 PyErr_SetString(PyExc_ValueError,
13432 "prec too big");
13433 goto onError;
13434 }
13435 prec = prec*10 + (c - '0');
13436 }
13437 }
13438 } /* prec */
13439 if (fmtcnt >= 0) {
13440 if (c == 'h' || c == 'l' || c == 'L') {
13441 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013442 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013443 }
13444 }
13445 if (fmtcnt < 0) {
13446 PyErr_SetString(PyExc_ValueError,
13447 "incomplete format");
13448 goto onError;
13449 }
13450 if (c != '%') {
13451 v = getnextarg(args, arglen, &argidx);
13452 if (v == NULL)
13453 goto onError;
13454 }
13455 sign = 0;
13456 fill = ' ';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013457 fillobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013458 switch (c) {
13459
13460 case '%':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013461 _PyAccu_Accumulate(&acc, percent);
13462 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013463
13464 case 's':
13465 case 'r':
13466 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000013467 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013468 temp = v;
13469 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013470 }
13471 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013472 if (c == 's')
13473 temp = PyObject_Str(v);
13474 else if (c == 'r')
13475 temp = PyObject_Repr(v);
13476 else
13477 temp = PyObject_ASCII(v);
13478 if (temp == NULL)
13479 goto onError;
13480 if (PyUnicode_Check(temp))
13481 /* nothing to do */;
13482 else {
13483 Py_DECREF(temp);
13484 PyErr_SetString(PyExc_TypeError,
13485 "%s argument has non-string str()");
13486 goto onError;
13487 }
13488 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013489 if (PyUnicode_READY(temp) == -1) {
13490 Py_CLEAR(temp);
13491 goto onError;
13492 }
13493 pbuf = PyUnicode_DATA(temp);
13494 kind = PyUnicode_KIND(temp);
13495 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013496 if (prec >= 0 && len > prec)
13497 len = prec;
13498 break;
13499
13500 case 'i':
13501 case 'd':
13502 case 'u':
13503 case 'o':
13504 case 'x':
13505 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000013506 isnumok = 0;
13507 if (PyNumber_Check(v)) {
13508 PyObject *iobj=NULL;
13509
13510 if (PyLong_Check(v)) {
13511 iobj = v;
13512 Py_INCREF(iobj);
13513 }
13514 else {
13515 iobj = PyNumber_Long(v);
13516 }
13517 if (iobj!=NULL) {
13518 if (PyLong_Check(iobj)) {
13519 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013520 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013521 Py_DECREF(iobj);
13522 if (!temp)
13523 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013524 if (PyUnicode_READY(temp) == -1) {
13525 Py_CLEAR(temp);
13526 goto onError;
13527 }
13528 pbuf = PyUnicode_DATA(temp);
13529 kind = PyUnicode_KIND(temp);
13530 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013531 sign = 1;
13532 }
13533 else {
13534 Py_DECREF(iobj);
13535 }
13536 }
13537 }
13538 if (!isnumok) {
13539 PyErr_Format(PyExc_TypeError,
13540 "%%%c format: a number is required, "
13541 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13542 goto onError;
13543 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013544 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013545 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013546 fillobj = zero;
13547 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013548 break;
13549
13550 case 'e':
13551 case 'E':
13552 case 'f':
13553 case 'F':
13554 case 'g':
13555 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013556 temp = formatfloat(v, flags, prec, c);
13557 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000013558 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013559 if (PyUnicode_READY(temp) == -1) {
13560 Py_CLEAR(temp);
13561 goto onError;
13562 }
13563 pbuf = PyUnicode_DATA(temp);
13564 kind = PyUnicode_KIND(temp);
13565 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013566 sign = 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013567 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013568 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013569 fillobj = zero;
13570 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013571 break;
13572
13573 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013574 {
13575 Py_UCS4 ch = formatchar(v);
13576 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013577 goto onError;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013578 temp = _PyUnicode_FromUCS4(&ch, 1);
13579 if (temp == NULL)
13580 goto onError;
13581 pbuf = PyUnicode_DATA(temp);
13582 kind = PyUnicode_KIND(temp);
13583 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013584 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013585 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013586
13587 default:
13588 PyErr_Format(PyExc_ValueError,
13589 "unsupported format character '%c' (0x%x) "
13590 "at index %zd",
13591 (31<=c && c<=126) ? (char)c : '?',
13592 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013593 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013594 goto onError;
13595 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013596 /* pbuf is initialized here. */
13597 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013598 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013599 if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
13600 signobj = minus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013601 len--;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013602 pindex++;
13603 }
13604 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
13605 signobj = plus;
13606 len--;
13607 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013608 }
13609 else if (flags & F_SIGN)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013610 signobj = plus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013611 else if (flags & F_BLANK)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013612 signobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013613 else
13614 sign = 0;
13615 }
13616 if (width < len)
13617 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013618 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013619 if (fill != ' ') {
13620 assert(signobj != NULL);
13621 if (_PyAccu_Accumulate(&acc, signobj))
13622 goto onError;
13623 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013624 if (width > len)
13625 width--;
13626 }
13627 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013628 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013629 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013630 if (fill != ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013631 second = get_latin1_char(
13632 PyUnicode_READ(kind, pbuf, pindex + 1));
13633 pindex += 2;
13634 if (second == NULL ||
13635 _PyAccu_Accumulate(&acc, zero) ||
13636 _PyAccu_Accumulate(&acc, second))
13637 goto onError;
13638 Py_CLEAR(second);
Benjamin Peterson29060642009-01-31 22:14:21 +000013639 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013640 width -= 2;
13641 if (width < 0)
13642 width = 0;
13643 len -= 2;
13644 }
13645 if (width > len && !(flags & F_LJUST)) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013646 assert(fillobj != NULL);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013647 if (repeat_accumulate(&acc, fillobj, width - len))
13648 goto onError;
13649 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013650 }
13651 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013652 if (sign) {
13653 assert(signobj != NULL);
13654 if (_PyAccu_Accumulate(&acc, signobj))
13655 goto onError;
13656 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013657 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013658 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13659 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013660 second = get_latin1_char(
13661 PyUnicode_READ(kind, pbuf, pindex + 1));
13662 pindex += 2;
13663 if (second == NULL ||
13664 _PyAccu_Accumulate(&acc, zero) ||
13665 _PyAccu_Accumulate(&acc, second))
13666 goto onError;
13667 Py_CLEAR(second);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013668 }
13669 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013670 /* Copy all characters, preserving len */
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013671 if (temp != NULL) {
13672 assert(pbuf == PyUnicode_DATA(temp));
13673 v = PyUnicode_Substring(temp, pindex, pindex + len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013674 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013675 else {
13676 const char *p = (const char *) pbuf;
13677 assert(pbuf != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013678 p += kind * pindex;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013679 v = PyUnicode_FromKindAndData(kind, p, len);
13680 }
13681 if (v == NULL)
13682 goto onError;
13683 r = _PyAccu_Accumulate(&acc, v);
13684 Py_DECREF(v);
13685 if (r)
13686 goto onError;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013687 if (width > len && repeat_accumulate(&acc, blank, width - len))
13688 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013689 if (dict && (argidx < arglen) && c != '%') {
13690 PyErr_SetString(PyExc_TypeError,
13691 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013692 goto onError;
13693 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013694 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013695 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013696 } /* until end */
13697 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013698 PyErr_SetString(PyExc_TypeError,
13699 "not all arguments converted during string formatting");
13700 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013701 }
13702
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013703 result = _PyAccu_Finish(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013704 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013705 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013706 }
13707 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013708 Py_XDECREF(temp);
13709 Py_XDECREF(second);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013710 return (PyObject *)result;
13711
Benjamin Peterson29060642009-01-31 22:14:21 +000013712 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013713 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013714 Py_XDECREF(temp);
13715 Py_XDECREF(second);
13716 _PyAccu_Destroy(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013717 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013718 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013719 }
13720 return NULL;
13721}
13722
Jeremy Hylton938ace62002-07-17 16:30:39 +000013723static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013724unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13725
Tim Peters6d6c1a32001-08-02 04:15:00 +000013726static PyObject *
13727unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13728{
Benjamin Peterson29060642009-01-31 22:14:21 +000013729 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013730 static char *kwlist[] = {"object", "encoding", "errors", 0};
13731 char *encoding = NULL;
13732 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013733
Benjamin Peterson14339b62009-01-31 16:36:08 +000013734 if (type != &PyUnicode_Type)
13735 return unicode_subtype_new(type, args, kwds);
13736 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013737 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013738 return NULL;
13739 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013740 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013741 if (encoding == NULL && errors == NULL)
13742 return PyObject_Str(x);
13743 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013744 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013745}
13746
Guido van Rossume023fe02001-08-30 03:12:59 +000013747static PyObject *
13748unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13749{
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013750 PyUnicodeObject *unicode, *self;
13751 Py_ssize_t length, char_size;
13752 int share_wstr, share_utf8;
13753 unsigned int kind;
13754 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013755
Benjamin Peterson14339b62009-01-31 16:36:08 +000013756 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013757
13758 unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
13759 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013760 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013761 assert(_PyUnicode_CHECK(unicode));
Victor Stinnere06e1452011-10-04 20:52:31 +020013762 if (PyUnicode_READY(unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013763 return NULL;
13764
13765 self = (PyUnicodeObject *) type->tp_alloc(type, 0);
13766 if (self == NULL) {
13767 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013768 return NULL;
13769 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013770 kind = PyUnicode_KIND(unicode);
13771 length = PyUnicode_GET_LENGTH(unicode);
13772
13773 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013774#ifdef Py_DEBUG
13775 _PyUnicode_HASH(self) = -1;
13776#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013777 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013778#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013779 _PyUnicode_STATE(self).interned = 0;
13780 _PyUnicode_STATE(self).kind = kind;
13781 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013782 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013783 _PyUnicode_STATE(self).ready = 1;
13784 _PyUnicode_WSTR(self) = NULL;
13785 _PyUnicode_UTF8_LENGTH(self) = 0;
13786 _PyUnicode_UTF8(self) = NULL;
13787 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013788 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013789
13790 share_utf8 = 0;
13791 share_wstr = 0;
13792 if (kind == PyUnicode_1BYTE_KIND) {
13793 char_size = 1;
13794 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13795 share_utf8 = 1;
13796 }
13797 else if (kind == PyUnicode_2BYTE_KIND) {
13798 char_size = 2;
13799 if (sizeof(wchar_t) == 2)
13800 share_wstr = 1;
13801 }
13802 else {
13803 assert(kind == PyUnicode_4BYTE_KIND);
13804 char_size = 4;
13805 if (sizeof(wchar_t) == 4)
13806 share_wstr = 1;
13807 }
13808
13809 /* Ensure we won't overflow the length. */
13810 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13811 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013812 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013813 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013814 data = PyObject_MALLOC((length + 1) * char_size);
13815 if (data == NULL) {
13816 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013817 goto onError;
13818 }
13819
Victor Stinnerc3c74152011-10-02 20:39:55 +020013820 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013821 if (share_utf8) {
13822 _PyUnicode_UTF8_LENGTH(self) = length;
13823 _PyUnicode_UTF8(self) = data;
13824 }
13825 if (share_wstr) {
13826 _PyUnicode_WSTR_LENGTH(self) = length;
13827 _PyUnicode_WSTR(self) = (wchar_t *)data;
13828 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013829
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013830 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013831 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013832 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013833#ifdef Py_DEBUG
13834 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13835#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020013836 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013837 return (PyObject *)self;
13838
13839onError:
13840 Py_DECREF(unicode);
13841 Py_DECREF(self);
13842 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013843}
13844
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013845PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000013846 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000013847\n\
Collin Winterd474ce82007-08-07 19:42:11 +000013848Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000013849encoding defaults to the current default string encoding.\n\
13850errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013851
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013852static PyObject *unicode_iter(PyObject *seq);
13853
/* Type object for "str".  Slots are given positionally; the trailing
   comment on each line names the slot being initialized. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013854PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000013855 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013856 "str", /* tp_name */
13857 sizeof(PyUnicodeObject), /* tp_basicsize */
13858 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013859 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013860 (destructor)unicode_dealloc, /* tp_dealloc */
13861 0, /* tp_print */
13862 0, /* tp_getattr */
13863 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013864 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013865 unicode_repr, /* tp_repr */
13866 &unicode_as_number, /* tp_as_number */
13867 &unicode_as_sequence, /* tp_as_sequence */
13868 &unicode_as_mapping, /* tp_as_mapping */
13869 (hashfunc) unicode_hash, /* tp_hash*/
13870 0, /* tp_call*/
13871 (reprfunc) unicode_str, /* tp_str */
13872 PyObject_GenericGetAttr, /* tp_getattro */
13873 0, /* tp_setattro */
13874 0, /* tp_as_buffer */
13875 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000013876 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013877 unicode_doc, /* tp_doc */
13878 0, /* tp_traverse */
13879 0, /* tp_clear */
13880 PyUnicode_RichCompare, /* tp_richcompare */
13881 0, /* tp_weaklistoffset */
13882 unicode_iter, /* tp_iter */
13883 0, /* tp_iternext */
13884 unicode_methods, /* tp_methods */
13885 0, /* tp_members */
13886 0, /* tp_getset */
13887 &PyBaseObject_Type, /* tp_base */
13888 0, /* tp_dict */
13889 0, /* tp_descr_get */
13890 0, /* tp_descr_set */
13891 0, /* tp_dictoffset */
13892 0, /* tp_init */
13893 0, /* tp_alloc */
13894 unicode_new, /* tp_new */
13895 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013896};
13897
13898/* Initialize the Unicode implementation */
13899
Victor Stinner3a50e702011-10-18 21:21:00 +020013900int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013901{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013902 int i;
13903
Thomas Wouters477c8d52006-05-27 19:21:47 +000013904 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013905 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013906 0x000A, /* LINE FEED */
13907 0x000D, /* CARRIAGE RETURN */
13908 0x001C, /* FILE SEPARATOR */
13909 0x001D, /* GROUP SEPARATOR */
13910 0x001E, /* RECORD SEPARATOR */
13911 0x0085, /* NEXT LINE */
13912 0x2028, /* LINE SEPARATOR */
13913 0x2029, /* PARAGRAPH SEPARATOR */
13914 };
13915
Fred Drakee4315f52000-05-09 19:53:39 +000013916 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013917 unicode_empty = PyUnicode_New(0, 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013918 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013919 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013920 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013921
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013922 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013923 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013924 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013925 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013926
13927 /* initialize the linebreak bloom filter */
13928 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013929 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013930 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013931
13932 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020013933
13934#ifdef HAVE_MBCS
13935 winver.dwOSVersionInfoSize = sizeof(winver);
13936 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
13937 PyErr_SetFromWindowsErr(0);
13938 return -1;
13939 }
13940#endif
13941 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013942}
13943
13944/* Finalize the Unicode implementation */
13945
/* There is nothing to release here: the function always reports that
   zero items were freed. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
13951
Guido van Rossumd57fd912000-03-10 22:53:23 +000013952void
Thomas Wouters78890102000-07-22 19:25:51 +000013953_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013954{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013955 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013956
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013957 Py_XDECREF(unicode_empty);
13958 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013959
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013960 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013961 if (unicode_latin1[i]) {
13962 Py_DECREF(unicode_latin1[i]);
13963 unicode_latin1[i] = NULL;
13964 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013965 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020013966 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000013967 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013968}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013969
Walter Dörwald16807132007-05-25 13:52:07 +000013970void
13971PyUnicode_InternInPlace(PyObject **p)
13972{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013973 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
13974 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020013975#ifdef Py_DEBUG
13976 assert(s != NULL);
13977 assert(_PyUnicode_CHECK(s));
13978#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000013979 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020013980 return;
13981#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000013982 /* If it's a subclass, we don't really know what putting
13983 it in the interned dict might do. */
13984 if (!PyUnicode_CheckExact(s))
13985 return;
13986 if (PyUnicode_CHECK_INTERNED(s))
13987 return;
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013988 if (_PyUnicode_READY_REPLACE(p)) {
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013989 assert(0 && "_PyUnicode_READY_REPLACE fail in PyUnicode_InternInPlace");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013990 return;
13991 }
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013992 s = (PyUnicodeObject *)(*p);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013993 if (interned == NULL) {
13994 interned = PyDict_New();
13995 if (interned == NULL) {
13996 PyErr_Clear(); /* Don't leave an exception */
13997 return;
13998 }
13999 }
14000 /* It might be that the GetItem call fails even
14001 though the key is present in the dictionary,
14002 namely when this happens during a stack overflow. */
14003 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000014004 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014005 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000014006
Benjamin Peterson29060642009-01-31 22:14:21 +000014007 if (t) {
14008 Py_INCREF(t);
14009 Py_DECREF(*p);
14010 *p = t;
14011 return;
14012 }
Walter Dörwald16807132007-05-25 13:52:07 +000014013
Benjamin Peterson14339b62009-01-31 16:36:08 +000014014 PyThreadState_GET()->recursion_critical = 1;
14015 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
14016 PyErr_Clear();
14017 PyThreadState_GET()->recursion_critical = 0;
14018 return;
14019 }
14020 PyThreadState_GET()->recursion_critical = 0;
14021 /* The two references in interned are not counted by refcnt.
14022 The deallocator will take care of this */
14023 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014024 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000014025}
14026
14027void
14028PyUnicode_InternImmortal(PyObject **p)
14029{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014030 PyUnicodeObject *u = (PyUnicodeObject *)*p;
14031
Benjamin Peterson14339b62009-01-31 16:36:08 +000014032 PyUnicode_InternInPlace(p);
14033 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014034 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014035 Py_INCREF(*p);
14036 }
Walter Dörwald16807132007-05-25 13:52:07 +000014037}
14038
14039PyObject *
14040PyUnicode_InternFromString(const char *cp)
14041{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014042 PyObject *s = PyUnicode_FromString(cp);
14043 if (s == NULL)
14044 return NULL;
14045 PyUnicode_InternInPlace(&s);
14046 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000014047}
14048
Alexander Belopolsky40018472011-02-26 01:02:56 +000014049void
14050_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000014051{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014052 PyObject *keys;
14053 PyUnicodeObject *s;
14054 Py_ssize_t i, n;
14055 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000014056
Benjamin Peterson14339b62009-01-31 16:36:08 +000014057 if (interned == NULL || !PyDict_Check(interned))
14058 return;
14059 keys = PyDict_Keys(interned);
14060 if (keys == NULL || !PyList_Check(keys)) {
14061 PyErr_Clear();
14062 return;
14063 }
Walter Dörwald16807132007-05-25 13:52:07 +000014064
Benjamin Peterson14339b62009-01-31 16:36:08 +000014065 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14066 detector, interned unicode strings are not forcibly deallocated;
14067 rather, we give them their stolen references back, and then clear
14068 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000014069
Benjamin Peterson14339b62009-01-31 16:36:08 +000014070 n = PyList_GET_SIZE(keys);
14071 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000014072 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014073 for (i = 0; i < n; i++) {
14074 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014075 if (PyUnicode_READY(s) == -1) {
14076 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014077 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020014078 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014079 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014080 case SSTATE_NOT_INTERNED:
14081 /* XXX Shouldn't happen */
14082 break;
14083 case SSTATE_INTERNED_IMMORTAL:
14084 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014085 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014086 break;
14087 case SSTATE_INTERNED_MORTAL:
14088 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014089 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014090 break;
14091 default:
14092 Py_FatalError("Inconsistent interned string state.");
14093 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014094 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014095 }
14096 fprintf(stderr, "total size of all interned strings: "
14097 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14098 "mortal/immortal\n", mortal_size, immortal_size);
14099 Py_DECREF(keys);
14100 PyDict_Clear(interned);
14101 Py_DECREF(interned);
14102 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000014103}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014104
14105
14106/********************* Unicode Iterator **************************/
14107
/* State of a str iterator: the string being walked and the index of the
   character that the next call to unicodeiter_next() will read. */
14108typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014109 PyObject_HEAD
14110 Py_ssize_t it_index; /* index of the next character to return */
14111 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014112} unicodeiterobject;
14113
14114static void
14115unicodeiter_dealloc(unicodeiterobject *it)
14116{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014117 _PyObject_GC_UNTRACK(it);
14118 Py_XDECREF(it->it_seq);
14119 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014120}
14121
14122static int
14123unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14124{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014125 Py_VISIT(it->it_seq);
14126 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014127}
14128
14129static PyObject *
14130unicodeiter_next(unicodeiterobject *it)
14131{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014132 PyUnicodeObject *seq;
14133 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014134
Benjamin Peterson14339b62009-01-31 16:36:08 +000014135 assert(it != NULL);
14136 seq = it->it_seq;
14137 if (seq == NULL)
14138 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014139 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014140
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014141 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14142 int kind = PyUnicode_KIND(seq);
14143 void *data = PyUnicode_DATA(seq);
14144 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14145 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014146 if (item != NULL)
14147 ++it->it_index;
14148 return item;
14149 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014150
Benjamin Peterson14339b62009-01-31 16:36:08 +000014151 Py_DECREF(seq);
14152 it->it_seq = NULL;
14153 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014154}
14155
14156static PyObject *
14157unicodeiter_len(unicodeiterobject *it)
14158{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014159 Py_ssize_t len = 0;
14160 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014161 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014162 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014163}
14164
14165PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14166
/* Method table for str iterators: only __length_hint__, which is
   implemented by unicodeiter_len() and takes no arguments. */
14167static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014168 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014169 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014170 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014171};
14172
/* Type object for "str_iterator".  Instances are GC-tracked
   (Py_TPFLAGS_HAVE_GC); slots after the last initializer are left zero. */
14173PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014174 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14175 "str_iterator", /* tp_name */
14176 sizeof(unicodeiterobject), /* tp_basicsize */
14177 0, /* tp_itemsize */
14178 /* methods */
14179 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14180 0, /* tp_print */
14181 0, /* tp_getattr */
14182 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014183 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014184 0, /* tp_repr */
14185 0, /* tp_as_number */
14186 0, /* tp_as_sequence */
14187 0, /* tp_as_mapping */
14188 0, /* tp_hash */
14189 0, /* tp_call */
14190 0, /* tp_str */
14191 PyObject_GenericGetAttr, /* tp_getattro */
14192 0, /* tp_setattro */
14193 0, /* tp_as_buffer */
14194 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14195 0, /* tp_doc */
14196 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14197 0, /* tp_clear */
14198 0, /* tp_richcompare */
14199 0, /* tp_weaklistoffset */
14200 PyObject_SelfIter, /* tp_iter */
14201 (iternextfunc)unicodeiter_next, /* tp_iternext */
14202 unicodeiter_methods, /* tp_methods */
14203 0, /* tp_members */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014204};
14205
14206static PyObject *
14207unicode_iter(PyObject *seq)
14208{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014209 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014210
Benjamin Peterson14339b62009-01-31 16:36:08 +000014211 if (!PyUnicode_Check(seq)) {
14212 PyErr_BadInternalCall();
14213 return NULL;
14214 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014215 if (PyUnicode_READY(seq) == -1)
14216 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014217 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14218 if (it == NULL)
14219 return NULL;
14220 it->it_index = 0;
14221 Py_INCREF(seq);
14222 it->it_seq = (PyUnicodeObject *)seq;
14223 _PyObject_GC_TRACK(it);
14224 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014225}
14226
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014227#define UNIOP(x) Py_UNICODE_##x
14228#define UNIOP_t Py_UNICODE
14229#include "uniops.h"
14230#undef UNIOP
14231#undef UNIOP_t
14232#define UNIOP(x) Py_UCS4_##x
14233#define UNIOP_t Py_UCS4
14234#include "uniops.h"
14235#undef UNIOP
14236#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000014237
Victor Stinner71133ff2010-09-01 23:43:53 +000014238Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000014239PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000014240{
14241 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
Victor Stinner577db2c2011-10-11 22:12:48 +020014242 Py_UNICODE *u, *copy;
Victor Stinner71133ff2010-09-01 23:43:53 +000014243 Py_ssize_t size;
14244
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014245 if (!PyUnicode_Check(unicode)) {
14246 PyErr_BadArgument();
14247 return NULL;
14248 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014249 u = PyUnicode_AsUnicode(object);
14250 if (u == NULL)
14251 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014252 /* Ensure we won't overflow the size. */
14253 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
14254 PyErr_NoMemory();
14255 return NULL;
14256 }
14257 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
14258 size *= sizeof(Py_UNICODE);
14259 copy = PyMem_Malloc(size);
14260 if (copy == NULL) {
14261 PyErr_NoMemory();
14262 return NULL;
14263 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014264 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014265 return copy;
14266}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014267
Georg Brandl66c221e2010-10-14 07:04:07 +000014268/* A _string module, to export formatter_parser and formatter_field_name_split
14269 to the string.Formatter class implemented in Python. */
14270
14271static PyMethodDef _string_methods[] = {
14272 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14273 METH_O, PyDoc_STR("split the argument as a field name")},
14274 {"formatter_parser", (PyCFunction) formatter_parser,
14275 METH_O, PyDoc_STR("parse the argument as a format string")},
14276 {NULL, NULL}
14277};
14278
14279static struct PyModuleDef _string_module = {
14280 PyModuleDef_HEAD_INIT,
14281 "_string",
14282 PyDoc_STR("string helper module"),
14283 0,
14284 _string_methods,
14285 NULL,
14286 NULL,
14287 NULL,
14288 NULL
14289};
14290
14291PyMODINIT_FUNC
14292PyInit__string(void)
14293{
14294 return PyModule_Create(&_string_module);
14295}
14296
14297
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014298#ifdef __cplusplus
14299}
14300#endif