blob: bf2b32a936346d60a27ae8ccc9996b6c372e9c3f [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
Victor Stinnerce5faf62011-10-05 00:42:43 +020049#ifdef Py_DEBUG
50# define DONT_MAKE_RESULT_READY
51#endif
52
Guido van Rossumd57fd912000-03-10 22:53:23 +000053/* Limit for the Unicode object free list */
54
Christian Heimes2202f872008-02-06 14:31:34 +000055#define PyUnicode_MAXFREELIST 1024
Guido van Rossumd57fd912000-03-10 22:53:23 +000056
57/* Limit for the Unicode object free list stay alive optimization.
58
59 The implementation will keep allocated Unicode memory intact for
60 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000061 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000062
Christian Heimes2202f872008-02-06 14:31:34 +000063 At worst this will result in PyUnicode_MAXFREELIST *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000065 malloc()-overhead) bytes of unused garbage.
66
67 Setting the limit to 0 effectively turns the feature off.
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069 Note: This is an experimental feature ! If you get core dumps when
70 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000071
72*/
73
Guido van Rossumfd4b9572000-04-10 13:51:10 +000074#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000075
76/* Endianness switches; defaults to little endian */
77
78#ifdef WORDS_BIGENDIAN
79# define BYTEORDER_IS_BIG_ENDIAN
80#else
81# define BYTEORDER_IS_LITTLE_ENDIAN
82#endif
83
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
86 The globals are initialized by the _PyUnicode_Init() API and should
87 not be used before calling that API.
88
89*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000090
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000091
92#ifdef __cplusplus
93extern "C" {
94#endif
95
Victor Stinner910337b2011-10-03 03:20:16 +020096#ifdef Py_DEBUG
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020097# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
Victor Stinner910337b2011-10-03 03:20:16 +020098#else
99# define _PyUnicode_CHECK(op) PyUnicode_Check(op)
100#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200101
/* --- Raw field accessors ------------------------------------------------
   These expand to plain member accesses on the object header structs and
   perform no checking; they can be used as assignment targets. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)

/* --- Checked accessors --------------------------------------------------
   Same fields as above, but guarded by assertions (active only when
   assert() is enabled). */

/* UTF-8 buffer of a ready string: for a compact ASCII string this is the
   memory immediately following the PyASCIIObject header, otherwise the
   stored utf8 pointer. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))

/* Length of the UTF-8 buffer of a ready string: for a compact ASCII string
   this is the string length itself, otherwise the stored utf8_length. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         _PyUnicode_LENGTH(op) :                        \
         _PyUnicode_UTF8_LENGTH(op))

/* state.kind of the string, after checking the object. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     _PyUnicode_STATE(op).kind)

/* length field of the string, after checking the object. */
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     _PyUnicode_LENGTH(op))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200136
/* Local override of PyUnicode_READY: evaluates to 0 when the string is
   already ready, otherwise to the result of _PyUnicode_Ready().  The object
   is checked with _PyUnicode_CHECK() first (assert builds only). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready((PyObject *)(op))))

/* Same as above, but takes a pointer to the object pointer and defers to
   _PyUnicode_ReadyReplace() when the string is not ready yet.
   The argument is parenthesized before being dereferenced so that
   expressions such as "p + 1" or "cond ? a : b" expand correctly. */
#define _PyUnicode_READY_REPLACE(p_obj)                 \
    (assert(_PyUnicode_CHECK(*(p_obj))),                \
     (PyUnicode_IS_READY(*(p_obj)) ?                    \
      0 : _PyUnicode_ReadyReplace((PyObject **)(p_obj))))
148
/* True if the UTF-8 pointer of the string aliases its character data.
   Must not be used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of the string aliases its character data.
   The macro only refers to its own argument; it previously read a variable
   named "unicode" from the expansion site, which silently tested the wrong
   object (or failed to compile) for any caller using a different name. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
156
/* True if the string owns a separate UTF-8 buffer: it is not a compact
   ASCII string, its utf8 pointer is set, and that pointer does not alias
   the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op) != NULL                    \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the string owns a separate wstr buffer: its wstr pointer is set
   and the string is either not ready yet or its wstr pointer does not alias
   the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) != NULL &&                    \
      !(PyUnicode_IS_READY(op) &&                       \
        _PyUnicode_WSTR(op) == PyUnicode_DATA(op))))
172
/* Generic element-wise copy between character buffers of different widths.

   from_type / to_type: valid type names of the source and destination
                        elements.
   begin / end:         pointers of type "from_type *" delimiting the source
                        range [begin, end).
   to:                  destination buffer; it is cast to "to_type *" and
                        receives one converted element per source element.

   Each element is converted with a plain C cast.  Note that "end" is
   re-evaluated on every iteration. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        const from_type *iter_ = (begin);               \
        to_type *to_ = (to_type *)(to);                 \
        while (iter_ < (end)) {                         \
            *to_ = (to_type)*iter_;                     \
            ++iter_;                                    \
            ++to_;                                      \
        }                                               \
    } while (0)

/* The Unicode string has been modified: invalidate the cached hash by
   resetting it to -1. */
#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
190
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
199static PyObject *interned;
200
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000201/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200202static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000203
204/* Single character Unicode strings in the Latin-1 range are being
205 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200206static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000207
/* Lookup table for fast detection of the most frequent whitespace
   characters: entry [c] is 1 when the 7-bit code point c is whitespace and
   0 otherwise.  One row per 8 code points. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
238
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200239/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200240static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200241static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200242
Alexander Belopolsky40018472011-02-26 01:02:56 +0000243static PyObject *
244unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000245 PyObject **errorHandler,const char *encoding, const char *reason,
246 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
247 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
248
Alexander Belopolsky40018472011-02-26 01:02:56 +0000249static void
250raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300251 const char *encoding,
252 const Py_UNICODE *unicode, Py_ssize_t size,
253 Py_ssize_t startpos, Py_ssize_t endpos,
254 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000255
/* Lookup table for line break characters in the 7-bit range: entry [c] is 1
   when code point c is a line break and 0 otherwise.  One row per 8 code
   points. */
static unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
283
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300284/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
285 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000286Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000287PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000288{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000289#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000290 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000291#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000292 /* This is actually an illegal character, so it should
293 not be passed to unichr. */
294 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000295#endif
296}
297
#ifdef Py_DEBUG
/* Debug-build sanity check of the internal state of a Unicode object.

   op:            the object to check (must satisfy PyUnicode_Check()).
   check_content: if non-zero and the string is not in the wchar_t-only
                  representation, also scan the characters and verify that
                  the narrowest possible kind is used.

   Every violated invariant triggers an assert(); the function returns 1
   when it returns at all, so it can itself be used inside assert().

   FIXME: use PyObject* type for op */
static int
_PyUnicode_CheckConsistency(void *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII: only the PyASCIIObject header exists */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;
        /* kind whose character size equals sizeof(wchar_t): for that kind
           the wstr pointer must alias the character data */
#if SIZEOF_WCHAR_T == 2
        const unsigned int wchar_t_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int wchar_t_kind = PyUnicode_4BYTE_KIND;
#endif

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: characters follow the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready: only the wstr buffer is populated */
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
            }
            else {
                /* ready, non-compact: data lives in a separate block */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    /* ASCII data doubles as the UTF-8 representation */
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else {
                    assert (compact->utf8 != data);
                }
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            if (kind == wchar_t_kind) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        /* absent caches must have a zero length */
        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t i = 0;
        Py_UCS4 maxchar = 0;
        void *data = PyUnicode_DATA(ascii);

        /* find the largest code point stored in the string */
        while (i < ascii->length) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
            i++;
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (ascii->state.ascii == 0)
                assert(maxchar >= 128);
            else
                assert(maxchar < 128);
            break;
        case PyUnicode_2BYTE_KIND:
            assert(maxchar >= 0x100);
            break;
        default:
            assert(maxchar >= 0x10000);
            break;
        }
    }
    return 1;
}
#endif
401
Thomas Wouters477c8d52006-05-27 19:21:47 +0000402/* --- Bloom Filters ----------------------------------------------------- */
403
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low-order
   bits of each Unicode character (ch & (BLOOM_WIDTH - 1)) as the bit index. */
407
408/* the linebreak mask is set up by Unicode_Init below */
409
Antoine Pitrouf068f942010-01-13 14:19:12 +0000410#if LONG_BIT >= 128
411#define BLOOM_WIDTH 128
412#elif LONG_BIT >= 64
413#define BLOOM_WIDTH 64
414#elif LONG_BIT >= 32
415#define BLOOM_WIDTH 32
416#else
417#error "LONG_BIT is smaller than 32"
418#endif
419
Thomas Wouters477c8d52006-05-27 19:21:47 +0000420#define BLOOM_MASK unsigned long
421
422static BLOOM_MASK bloom_linebreak;
423
/* Set, in the lvalue "mask", the bit selected by the low-order bits of
   "ch" (ch modulo BLOOM_WIDTH).  Evaluates to the updated mask. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Non-zero if the bit selected by "ch" is set in "mask".  A zero result
   means "ch" was never added; a non-zero result may be a false positive.
   "mask" is parenthesized so that compound expressions such as "a | b"
   are tested as a whole instead of being split by operator precedence. */
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Non-zero if "ch" is a line break: code points below 128 are looked up in
   ascii_linebreak[]; others are screened with the bloom_linebreak mask
   before calling Py_UNICODE_ISLINEBREAK(). */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000430
Alexander Belopolsky40018472011-02-26 01:02:56 +0000431Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200432make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000433{
434 /* calculate simple bloom-style bitmask for a given unicode string */
435
Antoine Pitrouf068f942010-01-13 14:19:12 +0000436 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000437 Py_ssize_t i;
438
439 mask = 0;
440 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200441 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000442
443 return mask;
444}
445
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200446#define BLOOM_MEMBER(mask, chr, str) \
447 (BLOOM(mask, chr) \
448 && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000449
Guido van Rossumd57fd912000-03-10 22:53:23 +0000450/* --- Unicode Object ----------------------------------------------------- */
451
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200452static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200453fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200454
455Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
456 Py_ssize_t size, Py_UCS4 ch,
457 int direction)
458{
459 /* like wcschr, but doesn't stop at NULL characters */
460 Py_ssize_t i;
461 if (direction == 1) {
462 for(i = 0; i < size; i++)
463 if (PyUnicode_READ(kind, s, i) == ch)
464 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
465 }
466 else {
467 for(i = size-1; i >= 0; i--)
468 if (PyUnicode_READ(kind, s, i) == ch)
469 return (char*)s + PyUnicode_KIND_SIZE(kind, i);
470 }
471 return NULL;
472}
473
Victor Stinnerfe226c02011-10-03 03:52:20 +0200474static PyObject*
475resize_compact(PyObject *unicode, Py_ssize_t length)
476{
477 Py_ssize_t char_size;
478 Py_ssize_t struct_size;
479 Py_ssize_t new_size;
480 int share_wstr;
481
482 assert(PyUnicode_IS_READY(unicode));
483 char_size = PyUnicode_CHARACTER_SIZE(unicode);
484 if (PyUnicode_IS_COMPACT_ASCII(unicode))
485 struct_size = sizeof(PyASCIIObject);
486 else
487 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200488 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200489
490 _Py_DEC_REFTOTAL;
491 _Py_ForgetReference(unicode);
492
493 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
494 PyErr_NoMemory();
495 return NULL;
496 }
497 new_size = (struct_size + (length + 1) * char_size);
498
499 unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
500 if (unicode == NULL) {
501 PyObject_Del(unicode);
502 PyErr_NoMemory();
503 return NULL;
504 }
505 _Py_NewReference(unicode);
506 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200507 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200508 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200509 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
510 _PyUnicode_WSTR_LENGTH(unicode) = length;
511 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200512 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
513 length, 0);
514 return unicode;
515}
516
Alexander Belopolsky40018472011-02-26 01:02:56 +0000517static int
Victor Stinner95663112011-10-04 01:03:50 +0200518resize_inplace(PyUnicodeObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000519{
Victor Stinner95663112011-10-04 01:03:50 +0200520 wchar_t *wstr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200521 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200522 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000523
Victor Stinner95663112011-10-04 01:03:50 +0200524 _PyUnicode_DIRTY(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200525
526 if (PyUnicode_IS_READY(unicode)) {
527 Py_ssize_t char_size;
528 Py_ssize_t new_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200529 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200530 void *data;
531
532 data = _PyUnicode_DATA_ANY(unicode);
533 assert(data != NULL);
534 char_size = PyUnicode_CHARACTER_SIZE(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200535 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
536 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinner95663112011-10-04 01:03:50 +0200537 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
538 {
539 PyObject_DEL(_PyUnicode_UTF8(unicode));
540 _PyUnicode_UTF8(unicode) = NULL;
541 _PyUnicode_UTF8_LENGTH(unicode) = 0;
542 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200543
544 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
545 PyErr_NoMemory();
546 return -1;
547 }
548 new_size = (length + 1) * char_size;
549
550 data = (PyObject *)PyObject_REALLOC(data, new_size);
551 if (data == NULL) {
552 PyErr_NoMemory();
553 return -1;
554 }
555 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200556 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200557 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200558 _PyUnicode_WSTR_LENGTH(unicode) = length;
559 }
560 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200561 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200562 _PyUnicode_UTF8_LENGTH(unicode) = length;
563 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200564 _PyUnicode_LENGTH(unicode) = length;
565 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200566 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200567 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200568 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200569 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200570 }
Victor Stinner95663112011-10-04 01:03:50 +0200571 assert(_PyUnicode_WSTR(unicode) != NULL);
572
573 /* check for integer overflow */
574 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
575 PyErr_NoMemory();
576 return -1;
577 }
578 wstr = _PyUnicode_WSTR(unicode);
579 wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
580 if (!wstr) {
581 PyErr_NoMemory();
582 return -1;
583 }
584 _PyUnicode_WSTR(unicode) = wstr;
585 _PyUnicode_WSTR(unicode)[length] = 0;
586 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200587 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000588 return 0;
589}
590
Victor Stinnerfe226c02011-10-03 03:52:20 +0200591static PyObject*
592resize_copy(PyObject *unicode, Py_ssize_t length)
593{
594 Py_ssize_t copy_length;
595 if (PyUnicode_IS_COMPACT(unicode)) {
596 PyObject *copy;
597 assert(PyUnicode_IS_READY(unicode));
598
599 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
600 if (copy == NULL)
601 return NULL;
602
603 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
604 if (PyUnicode_CopyCharacters(copy, 0,
605 unicode, 0,
606 copy_length) < 0)
607 {
608 Py_DECREF(copy);
609 return NULL;
610 }
611 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200612 }
613 else {
Victor Stinner2fd82272011-10-03 04:06:05 +0200614 PyUnicodeObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200615 assert(_PyUnicode_WSTR(unicode) != NULL);
616 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner2fd82272011-10-03 04:06:05 +0200617 w = _PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200618 if (w == NULL)
619 return NULL;
620 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
621 copy_length = Py_MIN(copy_length, length);
622 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
623 copy_length);
624 return (PyObject*)w;
625 }
626}
627
Guido van Rossumd57fd912000-03-10 22:53:23 +0000628/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000629 Ux0000 terminated; some code (e.g. new_identifier)
630 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000631
632 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000633 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000634
635*/
636
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200637#ifdef Py_DEBUG
638int unicode_old_new_calls = 0;
639#endif
640
Alexander Belopolsky40018472011-02-26 01:02:56 +0000641static PyUnicodeObject *
642_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000643{
644 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200645 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000646
Thomas Wouters477c8d52006-05-27 19:21:47 +0000647 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000648 if (length == 0 && unicode_empty != NULL) {
649 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200650 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000651 }
652
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000653 /* Ensure we won't overflow the size. */
654 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
655 return (PyUnicodeObject *)PyErr_NoMemory();
656 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200657 if (length < 0) {
658 PyErr_SetString(PyExc_SystemError,
659 "Negative size passed to _PyUnicode_New");
660 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000661 }
662
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200663#ifdef Py_DEBUG
664 ++unicode_old_new_calls;
665#endif
666
667 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
668 if (unicode == NULL)
669 return NULL;
670 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
671 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
672 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000673 PyErr_NoMemory();
674 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000675 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200676
Jeremy Hyltond8082792003-09-16 19:41:39 +0000677 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000678 * the caller fails before initializing str -- unicode_resize()
679 * reads str[0], and the Keep-Alive optimization can keep memory
680 * allocated for str alive across a call to unicode_dealloc(unicode).
681 * We don't want unicode_resize to read uninitialized memory in
682 * that case.
683 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200684 _PyUnicode_WSTR(unicode)[0] = 0;
685 _PyUnicode_WSTR(unicode)[length] = 0;
686 _PyUnicode_WSTR_LENGTH(unicode) = length;
687 _PyUnicode_HASH(unicode) = -1;
688 _PyUnicode_STATE(unicode).interned = 0;
689 _PyUnicode_STATE(unicode).kind = 0;
690 _PyUnicode_STATE(unicode).compact = 0;
691 _PyUnicode_STATE(unicode).ready = 0;
692 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200693 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200694 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200695 _PyUnicode_UTF8(unicode) = NULL;
696 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000697 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000698
Benjamin Peterson29060642009-01-31 22:14:21 +0000699 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000700 /* XXX UNREF/NEWREF interface should be more symmetrical */
701 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000702 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000703 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000704 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000705}
706
Victor Stinnerf42dc442011-10-02 23:33:16 +0200707static const char*
708unicode_kind_name(PyObject *unicode)
709{
Victor Stinner42dfd712011-10-03 14:41:45 +0200710 /* don't check consistency: unicode_kind_name() is called from
711 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200712 if (!PyUnicode_IS_COMPACT(unicode))
713 {
714 if (!PyUnicode_IS_READY(unicode))
715 return "wstr";
716 switch(PyUnicode_KIND(unicode))
717 {
718 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200719 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200720 return "legacy ascii";
721 else
722 return "legacy latin1";
723 case PyUnicode_2BYTE_KIND:
724 return "legacy UCS2";
725 case PyUnicode_4BYTE_KIND:
726 return "legacy UCS4";
727 default:
728 return "<legacy invalid kind>";
729 }
730 }
731 assert(PyUnicode_IS_READY(unicode));
732 switch(PyUnicode_KIND(unicode))
733 {
734 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200735 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200736 return "ascii";
737 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200738 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200739 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200740 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200741 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200742 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200743 default:
744 return "<invalid compact kind>";
745 }
746}
747
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200748#ifdef Py_DEBUG
749int unicode_new_new_calls = 0;
750
751/* Functions wrapping macros for use in debugger */
/* Out-of-line wrapper around the PyUnicode_UTF8() macro, so that it can
   be called from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
755
/* Out-of-line wrapper around the _PyUnicode_COMPACT_DATA() macro, so that
   it can be called from a debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
759void *_PyUnicode_data(void *unicode){
760 printf("obj %p\n", unicode);
761 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
762 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
763 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
764 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
765 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
766 return PyUnicode_DATA(unicode);
767}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200768
769void
770_PyUnicode_Dump(PyObject *op)
771{
772 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200773 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
774 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
775 void *data;
776 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
777 if (ascii->state.compact)
778 data = (compact + 1);
779 else
780 data = unicode->data.any;
781 if (ascii->wstr == data)
782 printf("shared ");
783 printf("wstr=%p", ascii->wstr);
Victor Stinnera3b334d2011-10-03 13:53:37 +0200784 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200785 printf(" (%zu), ", compact->wstr_length);
786 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
787 printf("shared ");
788 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200789 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200790 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200791}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200792#endif
793
794PyObject *
795PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
796{
797 PyObject *obj;
798 PyCompactUnicodeObject *unicode;
799 void *data;
800 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200801 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200802 Py_ssize_t char_size;
803 Py_ssize_t struct_size;
804
805 /* Optimization for empty strings */
806 if (size == 0 && unicode_empty != NULL) {
807 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200808 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200809 }
810
811#ifdef Py_DEBUG
812 ++unicode_new_new_calls;
813#endif
814
Victor Stinner9e9d6892011-10-04 01:02:02 +0200815 is_ascii = 0;
816 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200817 struct_size = sizeof(PyCompactUnicodeObject);
818 if (maxchar < 128) {
819 kind_state = PyUnicode_1BYTE_KIND;
820 char_size = 1;
821 is_ascii = 1;
822 struct_size = sizeof(PyASCIIObject);
823 }
824 else if (maxchar < 256) {
825 kind_state = PyUnicode_1BYTE_KIND;
826 char_size = 1;
827 }
828 else if (maxchar < 65536) {
829 kind_state = PyUnicode_2BYTE_KIND;
830 char_size = 2;
831 if (sizeof(wchar_t) == 2)
832 is_sharing = 1;
833 }
834 else {
835 kind_state = PyUnicode_4BYTE_KIND;
836 char_size = 4;
837 if (sizeof(wchar_t) == 4)
838 is_sharing = 1;
839 }
840
841 /* Ensure we won't overflow the size. */
842 if (size < 0) {
843 PyErr_SetString(PyExc_SystemError,
844 "Negative size passed to PyUnicode_New");
845 return NULL;
846 }
847 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
848 return PyErr_NoMemory();
849
850 /* Duplicated allocation code from _PyObject_New() instead of a call to
851 * PyObject_New() so we are able to allocate space for the object and
852 * it's data buffer.
853 */
854 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
855 if (obj == NULL)
856 return PyErr_NoMemory();
857 obj = PyObject_INIT(obj, &PyUnicode_Type);
858 if (obj == NULL)
859 return NULL;
860
861 unicode = (PyCompactUnicodeObject *)obj;
862 if (is_ascii)
863 data = ((PyASCIIObject*)obj) + 1;
864 else
865 data = unicode + 1;
866 _PyUnicode_LENGTH(unicode) = size;
867 _PyUnicode_HASH(unicode) = -1;
868 _PyUnicode_STATE(unicode).interned = 0;
869 _PyUnicode_STATE(unicode).kind = kind_state;
870 _PyUnicode_STATE(unicode).compact = 1;
871 _PyUnicode_STATE(unicode).ready = 1;
872 _PyUnicode_STATE(unicode).ascii = is_ascii;
873 if (is_ascii) {
874 ((char*)data)[size] = 0;
875 _PyUnicode_WSTR(unicode) = NULL;
876 }
877 else if (kind_state == PyUnicode_1BYTE_KIND) {
878 ((char*)data)[size] = 0;
879 _PyUnicode_WSTR(unicode) = NULL;
880 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200881 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200882 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200883 }
884 else {
885 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200886 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200887 if (kind_state == PyUnicode_2BYTE_KIND)
888 ((Py_UCS2*)data)[size] = 0;
889 else /* kind_state == PyUnicode_4BYTE_KIND */
890 ((Py_UCS4*)data)[size] = 0;
891 if (is_sharing) {
892 _PyUnicode_WSTR_LENGTH(unicode) = size;
893 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
894 }
895 else {
896 _PyUnicode_WSTR_LENGTH(unicode) = 0;
897 _PyUnicode_WSTR(unicode) = NULL;
898 }
899 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200900 assert(_PyUnicode_CheckConsistency(unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200901 return obj;
902}
903
904#if SIZEOF_WCHAR_T == 2
905/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
906 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +0200907 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200908
909 This function assumes that unicode can hold one more code point than wstr
910 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +0200911static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200912unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
913 PyUnicodeObject *unicode)
914{
915 const wchar_t *iter;
916 Py_UCS4 *ucs4_out;
917
Victor Stinner910337b2011-10-03 03:20:16 +0200918 assert(unicode != NULL);
919 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200920 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
921 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
922
923 for (iter = begin; iter < end; ) {
924 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
925 _PyUnicode_GET_LENGTH(unicode)));
926 if (*iter >= 0xD800 && *iter <= 0xDBFF
927 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
928 {
929 *ucs4_out++ = (((iter[0] & 0x3FF)<<10) | (iter[1] & 0x3FF)) + 0x10000;
930 iter += 2;
931 }
932 else {
933 *ucs4_out++ = *iter;
934 iter++;
935 }
936 }
937 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
938 _PyUnicode_GET_LENGTH(unicode)));
939
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200940}
941#endif
942
Victor Stinnercd9950f2011-10-02 00:34:53 +0200943static int
944_PyUnicode_Dirty(PyObject *unicode)
945{
Victor Stinner910337b2011-10-03 03:20:16 +0200946 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +0200947 if (Py_REFCNT(unicode) != 1) {
Victor Stinner01698042011-10-04 00:04:26 +0200948 PyErr_SetString(PyExc_SystemError,
Victor Stinnercd9950f2011-10-02 00:34:53 +0200949 "Cannot modify a string having more than 1 reference");
950 return -1;
951 }
952 _PyUnicode_DIRTY(unicode);
953 return 0;
954}
955
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200956Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200957PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
958 PyObject *from, Py_ssize_t from_start,
959 Py_ssize_t how_many)
960{
Victor Stinnera0702ab2011-09-29 14:14:38 +0200961 unsigned int from_kind, to_kind;
962 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200963
Victor Stinnerb1536152011-09-30 02:26:10 +0200964 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
965 PyErr_BadInternalCall();
966 return -1;
967 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200968
969 if (PyUnicode_READY(from))
970 return -1;
971 if (PyUnicode_READY(to))
972 return -1;
973
Victor Stinnerff9e50f2011-09-28 22:17:19 +0200974 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200975 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
Victor Stinner01698042011-10-04 00:04:26 +0200976 PyErr_Format(PyExc_SystemError,
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200977 "Cannot write %zi characters at %zi "
978 "in a string of %zi characters",
979 how_many, to_start, PyUnicode_GET_LENGTH(to));
980 return -1;
981 }
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200982 if (how_many == 0)
983 return 0;
984
Victor Stinnercd9950f2011-10-02 00:34:53 +0200985 if (_PyUnicode_Dirty(to))
Victor Stinnerf5ca1a22011-09-28 23:54:59 +0200986 return -1;
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200987
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200988 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200989 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200990 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +0200991 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200992
Victor Stinnerf42dc442011-10-02 23:33:16 +0200993 if (from_kind == to_kind
994 /* deny latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +0200995 && !(!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200996 {
Victor Stinnera0702ab2011-09-29 14:14:38 +0200997 Py_MEMCPY((char*)to_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +0200998 + PyUnicode_KIND_SIZE(to_kind, to_start),
Victor Stinnera0702ab2011-09-29 14:14:38 +0200999 (char*)from_data
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001000 + PyUnicode_KIND_SIZE(from_kind, from_start),
1001 PyUnicode_KIND_SIZE(to_kind, how_many));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001002 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001003 else if (from_kind == PyUnicode_1BYTE_KIND
1004 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001005 {
1006 _PyUnicode_CONVERT_BYTES(
1007 Py_UCS1, Py_UCS2,
1008 PyUnicode_1BYTE_DATA(from) + from_start,
1009 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1010 PyUnicode_2BYTE_DATA(to) + to_start
1011 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001012 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001013 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001014 && to_kind == PyUnicode_4BYTE_KIND)
1015 {
1016 _PyUnicode_CONVERT_BYTES(
1017 Py_UCS1, Py_UCS4,
1018 PyUnicode_1BYTE_DATA(from) + from_start,
1019 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1020 PyUnicode_4BYTE_DATA(to) + to_start
1021 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001022 }
1023 else if (from_kind == PyUnicode_2BYTE_KIND
1024 && to_kind == PyUnicode_4BYTE_KIND)
1025 {
1026 _PyUnicode_CONVERT_BYTES(
1027 Py_UCS2, Py_UCS4,
1028 PyUnicode_2BYTE_DATA(from) + from_start,
1029 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1030 PyUnicode_4BYTE_DATA(to) + to_start
1031 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001032 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001033 else {
1034 int invalid_kinds;
Victor Stinnerf42dc442011-10-02 23:33:16 +02001035
1036 /* check if max_char(from substring) <= max_char(to) */
1037 if (from_kind > to_kind
1038 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001039 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001040 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001041 /* slow path to check for character overflow */
1042 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1043 Py_UCS4 ch, maxchar;
1044 Py_ssize_t i;
1045
1046 maxchar = 0;
1047 invalid_kinds = 0;
1048 for (i=0; i < how_many; i++) {
1049 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1050 if (ch > maxchar) {
1051 maxchar = ch;
1052 if (maxchar > to_maxchar) {
1053 invalid_kinds = 1;
1054 break;
1055 }
1056 }
1057 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1058 }
1059 }
1060 else
1061 invalid_kinds = 1;
1062 if (invalid_kinds) {
Victor Stinner01698042011-10-04 00:04:26 +02001063 PyErr_Format(PyExc_SystemError,
Victor Stinnerf42dc442011-10-02 23:33:16 +02001064 "Cannot copy %s characters "
1065 "into a string of %s characters",
1066 unicode_kind_name(from),
1067 unicode_kind_name(to));
Victor Stinnera0702ab2011-09-29 14:14:38 +02001068 return -1;
1069 }
1070 }
1071 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001072}
1073
Victor Stinner17222162011-09-28 22:15:37 +02001074/* Find the maximum code point and count the number of surrogate pairs so a
1075 correct string length can be computed before converting a string to UCS4.
1076 This function counts single surrogates as a character and not as a pair.
1077
1078 Return 0 on success, or -1 on error. */
1079static int
1080find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1081 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001082{
1083 const wchar_t *iter;
1084
Victor Stinnerc53be962011-10-02 21:33:54 +02001085 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001086 *num_surrogates = 0;
1087 *maxchar = 0;
1088
1089 for (iter = begin; iter < end; ) {
Victor Stinnerae864852011-10-05 14:02:44 +02001090 if (*iter > *maxchar) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001091 *maxchar = *iter;
Victor Stinnerae864852011-10-05 14:02:44 +02001092#if SIZEOF_WCHAR_T != 2
1093 if (*maxchar >= 0x10000)
1094 return 0;
1095#endif
1096 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001097#if SIZEOF_WCHAR_T == 2
1098 if (*iter >= 0xD800 && *iter <= 0xDBFF
1099 && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1100 {
1101 Py_UCS4 surrogate_val;
1102 surrogate_val = (((iter[0] & 0x3FF)<<10)
1103 | (iter[1] & 0x3FF)) + 0x10000;
1104 ++(*num_surrogates);
1105 if (surrogate_val > *maxchar)
1106 *maxchar = surrogate_val;
1107 iter += 2;
1108 }
1109 else
1110 iter++;
1111#else
1112 iter++;
1113#endif
1114 }
1115 return 0;
1116}
1117
1118#ifdef Py_DEBUG
1119int unicode_ready_calls = 0;
1120#endif
1121
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001122static int
1123unicode_ready(PyObject **p_obj, int replace)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001124{
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001125 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001126 wchar_t *end;
1127 Py_UCS4 maxchar = 0;
1128 Py_ssize_t num_surrogates;
1129#if SIZEOF_WCHAR_T == 2
1130 Py_ssize_t length_wo_surrogates;
1131#endif
1132
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001133 assert(p_obj != NULL);
1134 unicode = (PyUnicodeObject *)*p_obj;
1135
Georg Brandl7597add2011-10-05 16:36:47 +02001136 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001137 strings were created using _PyObject_New() and where no canonical
1138 representation (the str field) has been set yet aka strings
1139 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001140 assert(_PyUnicode_CHECK(unicode));
1141 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001142 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001143 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001144 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001145 /* Actually, it should neither be interned nor be anything else: */
1146 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001147
1148#ifdef Py_DEBUG
1149 ++unicode_ready_calls;
1150#endif
1151
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001152#ifdef Py_DEBUG
1153 assert(!replace || Py_REFCNT(unicode) == 1);
1154#else
1155 if (replace && Py_REFCNT(unicode) != 1)
1156 replace = 0;
1157#endif
1158 if (replace) {
1159 Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode);
1160 wchar_t *wstr = _PyUnicode_WSTR(unicode);
1161 /* Optimization for empty strings */
1162 if (len == 0) {
1163 Py_INCREF(unicode_empty);
1164 Py_DECREF(*p_obj);
1165 *p_obj = unicode_empty;
1166 return 0;
1167 }
1168 if (len == 1 && wstr[0] < 256) {
1169 PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]);
1170 if (latin1_char == NULL)
1171 return -1;
1172 Py_DECREF(*p_obj);
1173 *p_obj = latin1_char;
1174 return 0;
1175 }
1176 }
1177
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001178 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001179 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001180 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001181 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001182
1183 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001184 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1185 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001186 PyErr_NoMemory();
1187 return -1;
1188 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001189 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001190 _PyUnicode_WSTR(unicode), end,
1191 PyUnicode_1BYTE_DATA(unicode));
1192 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1193 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1194 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1195 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001196 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001197 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001198 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001199 }
1200 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001201 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001202 _PyUnicode_UTF8(unicode) = NULL;
1203 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001204 }
1205 PyObject_FREE(_PyUnicode_WSTR(unicode));
1206 _PyUnicode_WSTR(unicode) = NULL;
1207 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1208 }
1209 /* In this case we might have to convert down from 4-byte native
1210 wchar_t to 2-byte unicode. */
1211 else if (maxchar < 65536) {
1212 assert(num_surrogates == 0 &&
1213 "FindMaxCharAndNumSurrogatePairs() messed up");
1214
Victor Stinner506f5922011-09-28 22:34:18 +02001215#if SIZEOF_WCHAR_T == 2
1216 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001217 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001218 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1219 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1220 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001221 _PyUnicode_UTF8(unicode) = NULL;
1222 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001223#else
1224 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001225 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001226 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001227 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001228 PyErr_NoMemory();
1229 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001230 }
Victor Stinner506f5922011-09-28 22:34:18 +02001231 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1232 _PyUnicode_WSTR(unicode), end,
1233 PyUnicode_2BYTE_DATA(unicode));
1234 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1235 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1236 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001237 _PyUnicode_UTF8(unicode) = NULL;
1238 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001239 PyObject_FREE(_PyUnicode_WSTR(unicode));
1240 _PyUnicode_WSTR(unicode) = NULL;
1241 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1242#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001243 }
1244 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1245 else {
1246#if SIZEOF_WCHAR_T == 2
1247 /* in case the native representation is 2-bytes, we need to allocate a
1248 new normalized 4-byte version. */
1249 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001250 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1251 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001252 PyErr_NoMemory();
1253 return -1;
1254 }
1255 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1256 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001257 _PyUnicode_UTF8(unicode) = NULL;
1258 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001259 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1260 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001261 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001262 PyObject_FREE(_PyUnicode_WSTR(unicode));
1263 _PyUnicode_WSTR(unicode) = NULL;
1264 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1265#else
1266 assert(num_surrogates == 0);
1267
Victor Stinnerc3c74152011-10-02 20:39:55 +02001268 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001269 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001270 _PyUnicode_UTF8(unicode) = NULL;
1271 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001272 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1273#endif
1274 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1275 }
1276 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001277 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001278 return 0;
1279}
1280
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02001281int
1282_PyUnicode_ReadyReplace(PyObject **op)
1283{
1284 return unicode_ready(op, 1);
1285}
1286
1287int
1288_PyUnicode_Ready(PyObject *op)
1289{
1290 return unicode_ready(&op, 0);
1291}
1292
Alexander Belopolsky40018472011-02-26 01:02:56 +00001293static void
1294unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001295{
Walter Dörwald16807132007-05-25 13:52:07 +00001296 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001297 case SSTATE_NOT_INTERNED:
1298 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001299
Benjamin Peterson29060642009-01-31 22:14:21 +00001300 case SSTATE_INTERNED_MORTAL:
1301 /* revive dead object temporarily for DelItem */
1302 Py_REFCNT(unicode) = 3;
1303 if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
1304 Py_FatalError(
1305 "deletion of interned string failed");
1306 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001307
Benjamin Peterson29060642009-01-31 22:14:21 +00001308 case SSTATE_INTERNED_IMMORTAL:
1309 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001310
Benjamin Peterson29060642009-01-31 22:14:21 +00001311 default:
1312 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001313 }
1314
Victor Stinner03490912011-10-03 23:45:12 +02001315 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001316 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001317 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001318 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001319
1320 if (PyUnicode_IS_COMPACT(unicode)) {
1321 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001322 }
1323 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001324 if (_PyUnicode_DATA_ANY(unicode))
1325 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Benjamin Peterson29060642009-01-31 22:14:21 +00001326 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001327 }
1328}
1329
Alexander Belopolsky40018472011-02-26 01:02:56 +00001330static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001331unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001332{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001333 if (Py_REFCNT(unicode) != 1)
1334 return 0;
1335 if (PyUnicode_CHECK_INTERNED(unicode))
1336 return 0;
Benjamin Peterson7f3140e2011-10-03 19:37:29 -04001337 assert(unicode != unicode_empty);
Victor Stinner77bb47b2011-10-03 20:06:05 +02001338#ifdef Py_DEBUG
1339 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND
1340 && PyUnicode_GET_LENGTH(unicode) == 1)
1341 {
1342 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001343 if (ch < 256 && unicode_latin1[ch] == unicode)
1344 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001345 }
Victor Stinner77bb47b2011-10-03 20:06:05 +02001346#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001347 return 1;
1348}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001349
Victor Stinnerfe226c02011-10-03 03:52:20 +02001350static int
1351unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1352{
1353 PyObject *unicode;
1354 Py_ssize_t old_length;
1355
1356 assert(p_unicode != NULL);
1357 unicode = *p_unicode;
1358
1359 assert(unicode != NULL);
1360 assert(PyUnicode_Check(unicode));
1361 assert(0 <= length);
1362
Victor Stinner910337b2011-10-03 03:20:16 +02001363 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001364 old_length = PyUnicode_WSTR_LENGTH(unicode);
1365 else
1366 old_length = PyUnicode_GET_LENGTH(unicode);
1367 if (old_length == length)
1368 return 0;
1369
Victor Stinnerfe226c02011-10-03 03:52:20 +02001370 if (!unicode_resizable(unicode)) {
1371 PyObject *copy = resize_copy(unicode, length);
1372 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001373 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001374 Py_DECREF(*p_unicode);
1375 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001376 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001377 }
1378
Victor Stinnerfe226c02011-10-03 03:52:20 +02001379 if (PyUnicode_IS_COMPACT(unicode)) {
1380 *p_unicode = resize_compact(unicode, length);
1381 if (*p_unicode == NULL)
1382 return -1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001383 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001384 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001385 }
1386 return resize_inplace((PyUnicodeObject*)unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001387}
1388
Alexander Belopolsky40018472011-02-26 01:02:56 +00001389int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001390PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001391{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001392 PyObject *unicode;
1393 if (p_unicode == NULL) {
1394 PyErr_BadInternalCall();
1395 return -1;
1396 }
1397 unicode = *p_unicode;
1398 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0
1399 || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND)
1400 {
1401 PyErr_BadInternalCall();
1402 return -1;
1403 }
1404 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001405}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001406
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001407static PyObject*
1408get_latin1_char(unsigned char ch)
1409{
Victor Stinnera464fc12011-10-02 20:39:30 +02001410 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001411 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001412 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001413 if (!unicode)
1414 return NULL;
1415 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001416 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001417 unicode_latin1[ch] = unicode;
1418 }
1419 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001420 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001421}
1422
Alexander Belopolsky40018472011-02-26 01:02:56 +00001423PyObject *
1424PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001425{
1426 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001427 Py_UCS4 maxchar = 0;
1428 Py_ssize_t num_surrogates;
1429
1430 if (u == NULL)
1431 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001432
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001433 /* If the Unicode data is known at construction time, we can apply
1434 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001435
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001436 /* Optimization for empty strings */
1437 if (size == 0 && unicode_empty != NULL) {
1438 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001439 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001440 }
Tim Petersced69f82003-09-16 20:30:58 +00001441
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001442 /* Single character Unicode objects in the Latin-1 range are
1443 shared when using this constructor */
1444 if (size == 1 && *u < 256)
1445 return get_latin1_char((unsigned char)*u);
1446
1447 /* If not empty and not single character, copy the Unicode data
1448 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001449 if (find_maxchar_surrogates(u, u + size,
1450 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001451 return NULL;
1452
1453 unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1454 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001455 if (!unicode)
1456 return NULL;
1457
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001458 switch (PyUnicode_KIND(unicode)) {
1459 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001460 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001461 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1462 break;
1463 case PyUnicode_2BYTE_KIND:
1464#if Py_UNICODE_SIZE == 2
1465 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1466#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001467 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001468 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1469#endif
1470 break;
1471 case PyUnicode_4BYTE_KIND:
1472#if SIZEOF_WCHAR_T == 2
1473 /* This is the only case which has to process surrogates, thus
1474 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001475 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001476#else
1477 assert(num_surrogates == 0);
1478 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1479#endif
1480 break;
1481 default:
1482 assert(0 && "Impossible state");
1483 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001484
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001485 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001486 return (PyObject *)unicode;
1487}
1488
Alexander Belopolsky40018472011-02-26 01:02:56 +00001489PyObject *
1490PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001491{
1492 PyUnicodeObject *unicode;
Christian Heimes33fe8092008-04-13 13:53:33 +00001493
Benjamin Peterson14339b62009-01-31 16:36:08 +00001494 if (size < 0) {
1495 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001496 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001497 return NULL;
1498 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001499
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001500 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001501 some optimizations which share commonly used objects.
1502 Also, this means the input must be UTF-8, so fall back to the
1503 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001504 if (u != NULL) {
1505
Benjamin Peterson29060642009-01-31 22:14:21 +00001506 /* Optimization for empty strings */
1507 if (size == 0 && unicode_empty != NULL) {
1508 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001509 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001510 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001511
1512 /* Single characters are shared when using this constructor.
1513 Restrict to ASCII, since the input must be UTF-8. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001514 if (size == 1 && Py_CHARMASK(*u) < 128)
1515 return get_latin1_char(Py_CHARMASK(*u));
Martin v. Löwis9c121062007-08-05 20:26:11 +00001516
1517 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001518 }
1519
Walter Dörwald55507312007-05-18 13:12:10 +00001520 unicode = _PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001521 if (!unicode)
1522 return NULL;
1523
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001524 return (PyObject *)unicode;
1525}
1526
Alexander Belopolsky40018472011-02-26 01:02:56 +00001527PyObject *
1528PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001529{
1530 size_t size = strlen(u);
1531 if (size > PY_SSIZE_T_MAX) {
1532 PyErr_SetString(PyExc_OverflowError, "input too long");
1533 return NULL;
1534 }
1535
1536 return PyUnicode_FromStringAndSize(u, size);
1537}
1538
Victor Stinnere57b1c02011-09-28 22:20:48 +02001539static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001540unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001541{
Victor Stinner0617b6e2011-10-05 23:26:01 +02001542 PyObject *res;
1543#ifdef Py_DEBUG
1544 const unsigned char *p;
1545 const unsigned char *end = s + size;
1546 for (p=s; p < end; p++) {
1547 assert(*p < 128);
1548 }
1549#endif
1550 res = PyUnicode_New(size, 127);
Victor Stinner702c7342011-10-05 13:50:52 +02001551 if (!res)
1552 return NULL;
Victor Stinner0617b6e2011-10-05 23:26:01 +02001553 memcpy(PyUnicode_1BYTE_DATA(res), s, size);
Victor Stinner702c7342011-10-05 13:50:52 +02001554 return res;
1555}
1556
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001557static Py_UCS4
1558kind_maxchar_limit(unsigned int kind)
1559{
1560 switch(kind) {
1561 case PyUnicode_1BYTE_KIND:
1562 return 0x80;
1563 case PyUnicode_2BYTE_KIND:
1564 return 0x100;
1565 case PyUnicode_4BYTE_KIND:
1566 return 0x10000;
1567 default:
1568 assert(0 && "invalid kind");
1569 return 0x10ffff;
1570 }
1571}
1572
Victor Stinner702c7342011-10-05 13:50:52 +02001573static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001574_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001575{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001576 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001577 unsigned char max_char = 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001578 Py_ssize_t i;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001579
1580 assert(size >= 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001581 for (i = 0; i < size; i++) {
1582 if (u[i] & 0x80) {
Victor Stinnerb9275c12011-10-05 14:01:42 +02001583 max_char = 255;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001584 break;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001585 }
1586 }
Victor Stinnerb9275c12011-10-05 14:01:42 +02001587 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001588 if (!res)
1589 return NULL;
1590 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001591 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001592 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001593}
1594
Victor Stinnere57b1c02011-09-28 22:20:48 +02001595static PyObject*
1596_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001597{
1598 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001599 Py_UCS2 max_char = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001600 Py_ssize_t i;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001601
1602 assert(size >= 0);
1603 for (i = 0; i < size; i++) {
1604 if (u[i] > max_char) {
1605 max_char = u[i];
1606 if (max_char >= 256)
1607 break;
1608 }
1609 }
1610 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001611 if (!res)
1612 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001613 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001614 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1615 else
1616 for (i = 0; i < size; i++)
1617 PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001618 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001619 return res;
1620}
1621
Victor Stinnere57b1c02011-09-28 22:20:48 +02001622static PyObject*
1623_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001624{
1625 PyObject *res;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001626 Py_UCS4 max_char = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001627 Py_ssize_t i;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001628
1629 assert(size >= 0);
1630 for (i = 0; i < size; i++) {
1631 if (u[i] > max_char) {
1632 max_char = u[i];
1633 if (max_char >= 0x10000)
1634 break;
1635 }
1636 }
1637 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001638 if (!res)
1639 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001640 if (max_char >= 0x10000)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001641 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1642 else {
1643 int kind = PyUnicode_KIND(res);
1644 void *data = PyUnicode_DATA(res);
1645 for (i = 0; i < size; i++)
1646 PyUnicode_WRITE(kind, data, i, u[i]);
1647 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001648 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001649 return res;
1650}
1651
1652PyObject*
1653PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1654{
1655 switch(kind) {
1656 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001657 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001658 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001659 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001660 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001661 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001662 default:
1663 assert(0 && "invalid kind");
1664 PyErr_SetString(PyExc_SystemError, "invalid kind");
1665 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001666 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001667}
1668
Victor Stinner034f6cf2011-09-30 02:26:44 +02001669PyObject*
1670PyUnicode_Copy(PyObject *unicode)
1671{
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001672 Py_ssize_t size;
1673 PyObject *copy;
1674 void *data;
1675
Victor Stinner034f6cf2011-09-30 02:26:44 +02001676 if (!PyUnicode_Check(unicode)) {
1677 PyErr_BadInternalCall();
1678 return NULL;
1679 }
1680 if (PyUnicode_READY(unicode))
1681 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001682
1683 size = PyUnicode_GET_LENGTH(unicode);
1684 copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1685 if (!copy)
1686 return NULL;
1687 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1688
1689 data = PyUnicode_DATA(unicode);
1690 switch (PyUnicode_KIND(unicode))
1691 {
1692 case PyUnicode_1BYTE_KIND:
1693 memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1694 break;
1695 case PyUnicode_2BYTE_KIND:
1696 memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1697 break;
1698 case PyUnicode_4BYTE_KIND:
1699 memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1700 break;
1701 default:
1702 assert(0);
1703 break;
1704 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001705 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001706 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001707}
1708
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001709
Victor Stinnerbc603d12011-10-02 01:00:40 +02001710/* Widen Unicode objects to larger buffers. Don't write terminating null
1711 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001712
1713void*
1714_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1715{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001716 Py_ssize_t len;
1717 void *result;
1718 unsigned int skind;
1719
1720 if (PyUnicode_READY(s))
1721 return NULL;
1722
1723 len = PyUnicode_GET_LENGTH(s);
1724 skind = PyUnicode_KIND(s);
1725 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02001726 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001727 return NULL;
1728 }
1729 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02001730 case PyUnicode_2BYTE_KIND:
1731 result = PyMem_Malloc(len * sizeof(Py_UCS2));
1732 if (!result)
1733 return PyErr_NoMemory();
1734 assert(skind == PyUnicode_1BYTE_KIND);
1735 _PyUnicode_CONVERT_BYTES(
1736 Py_UCS1, Py_UCS2,
1737 PyUnicode_1BYTE_DATA(s),
1738 PyUnicode_1BYTE_DATA(s) + len,
1739 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001740 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001741 case PyUnicode_4BYTE_KIND:
1742 result = PyMem_Malloc(len * sizeof(Py_UCS4));
1743 if (!result)
1744 return PyErr_NoMemory();
1745 if (skind == PyUnicode_2BYTE_KIND) {
1746 _PyUnicode_CONVERT_BYTES(
1747 Py_UCS2, Py_UCS4,
1748 PyUnicode_2BYTE_DATA(s),
1749 PyUnicode_2BYTE_DATA(s) + len,
1750 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001751 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02001752 else {
1753 assert(skind == PyUnicode_1BYTE_KIND);
1754 _PyUnicode_CONVERT_BYTES(
1755 Py_UCS1, Py_UCS4,
1756 PyUnicode_1BYTE_DATA(s),
1757 PyUnicode_1BYTE_DATA(s) + len,
1758 result);
1759 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001760 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02001761 default:
1762 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001763 }
Victor Stinner01698042011-10-04 00:04:26 +02001764 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001765 return NULL;
1766}
1767
1768static Py_UCS4*
1769as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1770 int copy_null)
1771{
1772 int kind;
1773 void *data;
1774 Py_ssize_t len, targetlen;
1775 if (PyUnicode_READY(string) == -1)
1776 return NULL;
1777 kind = PyUnicode_KIND(string);
1778 data = PyUnicode_DATA(string);
1779 len = PyUnicode_GET_LENGTH(string);
1780 targetlen = len;
1781 if (copy_null)
1782 targetlen++;
1783 if (!target) {
1784 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1785 PyErr_NoMemory();
1786 return NULL;
1787 }
1788 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1789 if (!target) {
1790 PyErr_NoMemory();
1791 return NULL;
1792 }
1793 }
1794 else {
1795 if (targetsize < targetlen) {
1796 PyErr_Format(PyExc_SystemError,
1797 "string is longer than the buffer");
1798 if (copy_null && 0 < targetsize)
1799 target[0] = 0;
1800 return NULL;
1801 }
1802 }
1803 if (kind != PyUnicode_4BYTE_KIND) {
1804 Py_ssize_t i;
1805 for (i = 0; i < len; i++)
1806 target[i] = PyUnicode_READ(kind, data, i);
1807 }
1808 else
1809 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1810 if (copy_null)
1811 target[len] = 0;
1812 return target;
1813}
1814
1815Py_UCS4*
1816PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1817 int copy_null)
1818{
1819 if (target == NULL || targetsize < 1) {
1820 PyErr_BadInternalCall();
1821 return NULL;
1822 }
1823 return as_ucs4(string, target, targetsize, copy_null);
1824}
1825
1826Py_UCS4*
1827PyUnicode_AsUCS4Copy(PyObject *string)
1828{
1829 return as_ucs4(string, NULL, 0, 1);
1830}
1831
1832#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00001833
Alexander Belopolsky40018472011-02-26 01:02:56 +00001834PyObject *
1835PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001836{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001837 if (w == NULL) {
Martin v. Löwis790465f2008-04-05 20:41:37 +00001838 if (size == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001839 return PyUnicode_New(0, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00001840 PyErr_BadInternalCall();
1841 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001842 }
1843
Martin v. Löwis790465f2008-04-05 20:41:37 +00001844 if (size == -1) {
1845 size = wcslen(w);
1846 }
1847
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001848 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001849}
1850
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001851#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00001852
Walter Dörwald346737f2007-05-31 10:44:43 +00001853static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001854makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1855 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00001856{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001857 *fmt++ = '%';
1858 if (width) {
1859 if (zeropad)
1860 *fmt++ = '0';
1861 fmt += sprintf(fmt, "%d", width);
1862 }
1863 if (precision)
1864 fmt += sprintf(fmt, ".%d", precision);
1865 if (longflag)
1866 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001867 else if (longlongflag) {
1868 /* longlongflag should only ever be nonzero on machines with
1869 HAVE_LONG_LONG defined */
1870#ifdef HAVE_LONG_LONG
1871 char *f = PY_FORMAT_LONG_LONG;
1872 while (*f)
1873 *fmt++ = *f++;
1874#else
1875 /* we shouldn't ever get here */
1876 assert(0);
1877 *fmt++ = 'l';
1878#endif
1879 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00001880 else if (size_tflag) {
1881 char *f = PY_FORMAT_SIZE_T;
1882 while (*f)
1883 *fmt++ = *f++;
1884 }
1885 *fmt++ = c;
1886 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00001887}
1888
Victor Stinner96865452011-03-01 23:44:09 +00001889/* helper for PyUnicode_FromFormatV() */
1890
1891static const char*
1892parse_format_flags(const char *f,
1893 int *p_width, int *p_precision,
1894 int *p_longflag, int *p_longlongflag, int *p_size_tflag)
1895{
1896 int width, precision, longflag, longlongflag, size_tflag;
1897
1898 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
1899 f++;
1900 width = 0;
1901 while (Py_ISDIGIT((unsigned)*f))
1902 width = (width*10) + *f++ - '0';
1903 precision = 0;
1904 if (*f == '.') {
1905 f++;
1906 while (Py_ISDIGIT((unsigned)*f))
1907 precision = (precision*10) + *f++ - '0';
1908 if (*f == '%') {
1909 /* "%.3%s" => f points to "3" */
1910 f--;
1911 }
1912 }
1913 if (*f == '\0') {
1914 /* bogus format "%.1" => go backward, f points to "1" */
1915 f--;
1916 }
1917 if (p_width != NULL)
1918 *p_width = width;
1919 if (p_precision != NULL)
1920 *p_precision = precision;
1921
1922 /* Handle %ld, %lu, %lld and %llu. */
1923 longflag = 0;
1924 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00001925 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00001926
1927 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00001928 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00001929 longflag = 1;
1930 ++f;
1931 }
1932#ifdef HAVE_LONG_LONG
1933 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00001934 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00001935 longlongflag = 1;
1936 f += 2;
1937 }
1938#endif
1939 }
1940 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00001941 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00001942 size_tflag = 1;
1943 ++f;
1944 }
1945 if (p_longflag != NULL)
1946 *p_longflag = longflag;
1947 if (p_longlongflag != NULL)
1948 *p_longlongflag = longlongflag;
1949 if (p_size_tflag != NULL)
1950 *p_size_tflag = size_tflag;
1951 return f;
1952}
1953
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001954/* maximum number of characters required for output of %ld. 21 characters
1955 allows for 64-bit integers (in decimal) and an optional sign. */
1956#define MAX_LONG_CHARS 21
1957/* maximum number of characters required for output of %lld.
1958 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
1959 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
1960#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
1961
Walter Dörwaldd2034312007-05-18 16:29:38 +00001962PyObject *
1963PyUnicode_FromFormatV(const char *format, va_list vargs)
1964{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001965 va_list count;
1966 Py_ssize_t callcount = 0;
1967 PyObject **callresults = NULL;
1968 PyObject **callresult = NULL;
1969 Py_ssize_t n = 0;
1970 int width = 0;
1971 int precision = 0;
1972 int zeropad;
1973 const char* f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001974 PyUnicodeObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001975 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00001976 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001977 Py_UCS4 maxchar = 127; /* result is ASCII by default */
1978 Py_UCS4 argmaxchar;
1979 Py_ssize_t numbersize = 0;
1980 char *numberresults = NULL;
1981 char *numberresult = NULL;
1982 Py_ssize_t i;
1983 int kind;
1984 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00001985
Victor Stinner4a2b7a12010-08-13 14:03:48 +00001986 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00001987 /* step 1: count the number of %S/%R/%A/%s format specifications
1988 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
1989 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001990 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02001991 * also estimate a upper bound for all the number formats in the string,
1992 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001993 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00001994 for (f = format; *f; f++) {
1995 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00001996 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001997 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
1998 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
1999 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2000 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002001
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002002 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002003#ifdef HAVE_LONG_LONG
2004 if (longlongflag) {
2005 if (width < MAX_LONG_LONG_CHARS)
2006 width = MAX_LONG_LONG_CHARS;
2007 }
2008 else
2009#endif
2010 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2011 including sign. Decimal takes the most space. This
2012 isn't enough for octal. If a width is specified we
2013 need more (which we allocate later). */
2014 if (width < MAX_LONG_CHARS)
2015 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002016
2017 /* account for the size + '\0' to separate numbers
2018 inside of the numberresults buffer */
2019 numbersize += (width + 1);
2020 }
2021 }
2022 else if ((unsigned char)*f > 127) {
2023 PyErr_Format(PyExc_ValueError,
2024 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2025 "string, got a non-ASCII byte: 0x%02x",
2026 (unsigned char)*f);
2027 return NULL;
2028 }
2029 }
2030 /* step 2: allocate memory for the results of
2031 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2032 if (callcount) {
2033 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2034 if (!callresults) {
2035 PyErr_NoMemory();
2036 return NULL;
2037 }
2038 callresult = callresults;
2039 }
2040 /* step 2.5: allocate memory for the results of formating numbers */
2041 if (numbersize) {
2042 numberresults = PyObject_Malloc(numbersize);
2043 if (!numberresults) {
2044 PyErr_NoMemory();
2045 goto fail;
2046 }
2047 numberresult = numberresults;
2048 }
2049
2050 /* step 3: format numbers and figure out how large a buffer we need */
2051 for (f = format; *f; f++) {
2052 if (*f == '%') {
2053 const char* p;
2054 int longflag;
2055 int longlongflag;
2056 int size_tflag;
2057 int numprinted;
2058
2059 p = f;
2060 zeropad = (f[1] == '0');
2061 f = parse_format_flags(f, &width, &precision,
2062 &longflag, &longlongflag, &size_tflag);
2063 switch (*f) {
2064 case 'c':
2065 {
2066 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002067 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002068 n++;
2069 break;
2070 }
2071 case '%':
2072 n++;
2073 break;
2074 case 'i':
2075 case 'd':
2076 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2077 width, precision, *f);
2078 if (longflag)
2079 numprinted = sprintf(numberresult, fmt,
2080 va_arg(count, long));
2081#ifdef HAVE_LONG_LONG
2082 else if (longlongflag)
2083 numprinted = sprintf(numberresult, fmt,
2084 va_arg(count, PY_LONG_LONG));
2085#endif
2086 else if (size_tflag)
2087 numprinted = sprintf(numberresult, fmt,
2088 va_arg(count, Py_ssize_t));
2089 else
2090 numprinted = sprintf(numberresult, fmt,
2091 va_arg(count, int));
2092 n += numprinted;
2093 /* advance by +1 to skip over the '\0' */
2094 numberresult += (numprinted + 1);
2095 assert(*(numberresult - 1) == '\0');
2096 assert(*(numberresult - 2) != '\0');
2097 assert(numprinted >= 0);
2098 assert(numberresult <= numberresults + numbersize);
2099 break;
2100 case 'u':
2101 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2102 width, precision, 'u');
2103 if (longflag)
2104 numprinted = sprintf(numberresult, fmt,
2105 va_arg(count, unsigned long));
2106#ifdef HAVE_LONG_LONG
2107 else if (longlongflag)
2108 numprinted = sprintf(numberresult, fmt,
2109 va_arg(count, unsigned PY_LONG_LONG));
2110#endif
2111 else if (size_tflag)
2112 numprinted = sprintf(numberresult, fmt,
2113 va_arg(count, size_t));
2114 else
2115 numprinted = sprintf(numberresult, fmt,
2116 va_arg(count, unsigned int));
2117 n += numprinted;
2118 numberresult += (numprinted + 1);
2119 assert(*(numberresult - 1) == '\0');
2120 assert(*(numberresult - 2) != '\0');
2121 assert(numprinted >= 0);
2122 assert(numberresult <= numberresults + numbersize);
2123 break;
2124 case 'x':
2125 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2126 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2127 n += numprinted;
2128 numberresult += (numprinted + 1);
2129 assert(*(numberresult - 1) == '\0');
2130 assert(*(numberresult - 2) != '\0');
2131 assert(numprinted >= 0);
2132 assert(numberresult <= numberresults + numbersize);
2133 break;
2134 case 'p':
2135 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2136 /* %p is ill-defined: ensure leading 0x. */
2137 if (numberresult[1] == 'X')
2138 numberresult[1] = 'x';
2139 else if (numberresult[1] != 'x') {
2140 memmove(numberresult + 2, numberresult,
2141 strlen(numberresult) + 1);
2142 numberresult[0] = '0';
2143 numberresult[1] = 'x';
2144 numprinted += 2;
2145 }
2146 n += numprinted;
2147 numberresult += (numprinted + 1);
2148 assert(*(numberresult - 1) == '\0');
2149 assert(*(numberresult - 2) != '\0');
2150 assert(numprinted >= 0);
2151 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002152 break;
2153 case 's':
2154 {
2155 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002156 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002157 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2158 if (!str)
2159 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002160 /* since PyUnicode_DecodeUTF8 returns already flexible
2161 unicode objects, there is no need to call ready on them */
2162 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002163 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002164 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002165 /* Remember the str and switch to the next slot */
2166 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002167 break;
2168 }
2169 case 'U':
2170 {
2171 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002172 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002173 if (PyUnicode_READY(obj) == -1)
2174 goto fail;
2175 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002176 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002177 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002178 break;
2179 }
2180 case 'V':
2181 {
2182 PyObject *obj = va_arg(count, PyObject *);
2183 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002184 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002185 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002186 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002187 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002188 if (PyUnicode_READY(obj) == -1)
2189 goto fail;
2190 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002191 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002192 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002193 *callresult++ = NULL;
2194 }
2195 else {
2196 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2197 if (!str_obj)
2198 goto fail;
Victor Stinnere1335c72011-10-04 20:53:03 +02002199 if (PyUnicode_READY(str_obj)) {
2200 Py_DECREF(str_obj);
2201 goto fail;
2202 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002203 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002204 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002205 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002206 *callresult++ = str_obj;
2207 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002208 break;
2209 }
2210 case 'S':
2211 {
2212 PyObject *obj = va_arg(count, PyObject *);
2213 PyObject *str;
2214 assert(obj);
2215 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002216 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002217 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002218 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002219 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002220 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002221 /* Remember the str and switch to the next slot */
2222 *callresult++ = str;
2223 break;
2224 }
2225 case 'R':
2226 {
2227 PyObject *obj = va_arg(count, PyObject *);
2228 PyObject *repr;
2229 assert(obj);
2230 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002231 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002232 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002233 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002234 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002235 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002236 /* Remember the repr and switch to the next slot */
2237 *callresult++ = repr;
2238 break;
2239 }
2240 case 'A':
2241 {
2242 PyObject *obj = va_arg(count, PyObject *);
2243 PyObject *ascii;
2244 assert(obj);
2245 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002246 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002247 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002248 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002249 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002250 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002251 /* Remember the repr and switch to the next slot */
2252 *callresult++ = ascii;
2253 break;
2254 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002255 default:
2256 /* if we stumble upon an unknown
2257 formatting code, copy the rest of
2258 the format string to the output
2259 string. (we cannot just skip the
2260 code, since there's no way to know
2261 what's in the argument list) */
2262 n += strlen(p);
2263 goto expand;
2264 }
2265 } else
2266 n++;
2267 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002268 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002269 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002270 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002271 we don't have to resize the string.
2272 There can be no errors beyond this point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002273 string = (PyUnicodeObject *)PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002274 if (!string)
2275 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002276 kind = PyUnicode_KIND(string);
2277 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002278 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002279 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002280
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002281 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002282 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002283 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002284
2285 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002286 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2287 /* checking for == because the last argument could be a empty
2288 string, which causes i to point to end, the assert at the end of
2289 the loop */
2290 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002291
Benjamin Peterson14339b62009-01-31 16:36:08 +00002292 switch (*f) {
2293 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002294 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002295 const int ordinal = va_arg(vargs, int);
2296 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002297 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002298 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002299 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002300 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002301 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002302 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002303 case 'p':
2304 /* unused, since we already have the result */
2305 if (*f == 'p')
2306 (void) va_arg(vargs, void *);
2307 else
2308 (void) va_arg(vargs, int);
2309 /* extract the result from numberresults and append. */
2310 for (; *numberresult; ++i, ++numberresult)
2311 PyUnicode_WRITE(kind, data, i, *numberresult);
2312 /* skip over the separating '\0' */
2313 assert(*numberresult == '\0');
2314 numberresult++;
2315 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002316 break;
2317 case 's':
2318 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002319 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002320 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002321 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002322 size = PyUnicode_GET_LENGTH(*callresult);
2323 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002324 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2325 *callresult, 0,
2326 size) < 0)
2327 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002328 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002329 /* We're done with the unicode()/repr() => forget it */
2330 Py_DECREF(*callresult);
2331 /* switch to next unicode()/repr() result */
2332 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002333 break;
2334 }
2335 case 'U':
2336 {
2337 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002338 Py_ssize_t size;
2339 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2340 size = PyUnicode_GET_LENGTH(obj);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002341 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2342 obj, 0,
2343 size) < 0)
2344 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002345 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002346 break;
2347 }
2348 case 'V':
2349 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002350 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002351 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002352 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002353 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002354 size = PyUnicode_GET_LENGTH(obj);
2355 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002356 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2357 obj, 0,
2358 size) < 0)
2359 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002360 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002361 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002362 size = PyUnicode_GET_LENGTH(*callresult);
2363 assert(PyUnicode_KIND(*callresult) <=
2364 PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002365 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2366 *callresult,
2367 0, size) < 0)
2368 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002369 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002370 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002371 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002372 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002373 break;
2374 }
2375 case 'S':
2376 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002377 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002378 {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002379 /* unused, since we already have the result */
2380 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002381 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02002382 if (PyUnicode_CopyCharacters((PyObject*)string, i,
2383 *callresult, 0,
2384 PyUnicode_GET_LENGTH(*callresult)) < 0)
2385 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002386 i += PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002387 /* We're done with the unicode()/repr() => forget it */
2388 Py_DECREF(*callresult);
2389 /* switch to next unicode()/repr() result */
2390 ++callresult;
2391 break;
2392 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002393 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002394 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002395 break;
2396 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002397 for (; *p; ++p, ++i)
2398 PyUnicode_WRITE(kind, data, i, *p);
2399 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002400 goto end;
2401 }
Victor Stinner1205f272010-09-11 00:54:47 +00002402 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002403 else {
2404 assert(i < PyUnicode_GET_LENGTH(string));
2405 PyUnicode_WRITE(kind, data, i++, *f);
2406 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002407 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002408 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002409
Benjamin Peterson29060642009-01-31 22:14:21 +00002410 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002411 if (callresults)
2412 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002413 if (numberresults)
2414 PyObject_Free(numberresults);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002415 assert(_PyUnicode_CheckConsistency(string, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002416 return (PyObject *)string;
Benjamin Peterson29060642009-01-31 22:14:21 +00002417 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002418 if (callresults) {
2419 PyObject **callresult2 = callresults;
2420 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002421 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002422 ++callresult2;
2423 }
2424 PyObject_Free(callresults);
2425 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002426 if (numberresults)
2427 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002428 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002429}
2430
Walter Dörwaldd2034312007-05-18 16:29:38 +00002431PyObject *
2432PyUnicode_FromFormat(const char *format, ...)
2433{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002434 PyObject* ret;
2435 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002436
2437#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002438 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002439#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002440 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002441#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002442 ret = PyUnicode_FromFormatV(format, vargs);
2443 va_end(vargs);
2444 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002445}
2446
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002447#ifdef HAVE_WCHAR_H
2448
Victor Stinner5593d8a2010-10-02 11:11:27 +00002449/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2450 convert a Unicode object to a wide character string.
2451
Victor Stinnerd88d9832011-09-06 02:00:05 +02002452 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002453 character) required to convert the unicode object. Ignore size argument.
2454
Victor Stinnerd88d9832011-09-06 02:00:05 +02002455 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002456 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002457 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002458static Py_ssize_t
Victor Stinner137c34c2010-09-29 10:25:54 +00002459unicode_aswidechar(PyUnicodeObject *unicode,
2460 wchar_t *w,
2461 Py_ssize_t size)
2462{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002463 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002464 const wchar_t *wstr;
2465
2466 wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
2467 if (wstr == NULL)
2468 return -1;
2469
Victor Stinner5593d8a2010-10-02 11:11:27 +00002470 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002471 if (size > res)
2472 size = res + 1;
2473 else
2474 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002475 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002476 return res;
2477 }
2478 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002479 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002480}
2481
2482Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002483PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002484 wchar_t *w,
2485 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002486{
2487 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002488 PyErr_BadInternalCall();
2489 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002490 }
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002491 return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002492}
2493
Victor Stinner137c34c2010-09-29 10:25:54 +00002494wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002495PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002496 Py_ssize_t *size)
2497{
2498 wchar_t* buffer;
2499 Py_ssize_t buflen;
2500
2501 if (unicode == NULL) {
2502 PyErr_BadInternalCall();
2503 return NULL;
2504 }
2505
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002506 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002507 if (buflen == -1)
2508 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002509 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002510 PyErr_NoMemory();
2511 return NULL;
2512 }
2513
Victor Stinner137c34c2010-09-29 10:25:54 +00002514 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2515 if (buffer == NULL) {
2516 PyErr_NoMemory();
2517 return NULL;
2518 }
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002519 buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002520 if (buflen == -1)
2521 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002522 if (size != NULL)
2523 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002524 return buffer;
2525}
2526
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002527#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002528
Alexander Belopolsky40018472011-02-26 01:02:56 +00002529PyObject *
2530PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002531{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002532 PyObject *v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002533 if (ordinal < 0 || ordinal > 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002534 PyErr_SetString(PyExc_ValueError,
2535 "chr() arg not in range(0x110000)");
2536 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002537 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002538
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002539 if (ordinal < 256)
2540 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002541
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002542 v = PyUnicode_New(1, ordinal);
2543 if (v == NULL)
2544 return NULL;
2545 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002546 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002547 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002548}
2549
Alexander Belopolsky40018472011-02-26 01:02:56 +00002550PyObject *
2551PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002552{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002553 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002554 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002555 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002556 if (PyUnicode_READY(obj))
2557 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002558 Py_INCREF(obj);
2559 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002560 }
2561 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002562 /* For a Unicode subtype that's not a Unicode object,
2563 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002564 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002565 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002566 PyErr_Format(PyExc_TypeError,
2567 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002568 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002569 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002570}
2571
Alexander Belopolsky40018472011-02-26 01:02:56 +00002572PyObject *
2573PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002574 const char *encoding,
2575 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002576{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002577 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002578 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002579
Guido van Rossumd57fd912000-03-10 22:53:23 +00002580 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002581 PyErr_BadInternalCall();
2582 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002583 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002584
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002585 /* Decoding bytes objects is the most common case and should be fast */
2586 if (PyBytes_Check(obj)) {
2587 if (PyBytes_GET_SIZE(obj) == 0) {
2588 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002589 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002590 }
2591 else {
2592 v = PyUnicode_Decode(
2593 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2594 encoding, errors);
2595 }
2596 return v;
2597 }
2598
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002599 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002600 PyErr_SetString(PyExc_TypeError,
2601 "decoding str is not supported");
2602 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002603 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002604
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002605 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2606 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2607 PyErr_Format(PyExc_TypeError,
2608 "coercing to str: need bytes, bytearray "
2609 "or buffer-like object, %.80s found",
2610 Py_TYPE(obj)->tp_name);
2611 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002612 }
Tim Petersced69f82003-09-16 20:30:58 +00002613
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002614 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002615 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002616 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002617 }
Tim Petersced69f82003-09-16 20:30:58 +00002618 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002619 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002620
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002621 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002622 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002623}
2624
/* Write a normalized copy of encoding into lower: upper case letters
   (as classified by Py_ISUPPER) are lowered and '_' becomes '-', so that
   e.g. UTF_8 is recognized.  lower_len is the capacity of lower in bytes,
   terminating null byte included.

   Return 1 on success, 0 if encoding is longer than lower_len-1 (lower
   is then left without a terminating null byte). */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* Last usable slot; it is reserved for the terminating null byte. */
    char *const limit = &lower[lower_len - 1];

    for (src = encoding; *src != '\0'; src++) {
        if (dst == limit)
            return 0;
        if (Py_ISUPPER(*src))
            *dst = Py_TOLOWER(*src);
        else if (*src == '_')
            *dst = '-';
        else
            *dst = *src;
        dst++;
    }
    *dst = '\0';
    return 1;
}
2657
Alexander Belopolsky40018472011-02-26 01:02:56 +00002658PyObject *
2659PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002660 Py_ssize_t size,
2661 const char *encoding,
2662 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002663{
2664 PyObject *buffer = NULL, *unicode;
2665 Py_buffer info;
2666 char lower[11]; /* Enough for any encoding shortcut */
2667
2668 if (encoding == NULL)
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002669 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +00002670
2671 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002672 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002673 if ((strcmp(lower, "utf-8") == 0) ||
2674 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002675 return PyUnicode_DecodeUTF8(s, size, errors);
2676 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002677 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002678 (strcmp(lower, "iso-8859-1") == 0))
2679 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002680#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002681 else if (strcmp(lower, "mbcs") == 0)
2682 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002683#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002684 else if (strcmp(lower, "ascii") == 0)
2685 return PyUnicode_DecodeASCII(s, size, errors);
2686 else if (strcmp(lower, "utf-16") == 0)
2687 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2688 else if (strcmp(lower, "utf-32") == 0)
2689 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2690 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002691
2692 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002693 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002694 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002695 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002696 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002697 if (buffer == NULL)
2698 goto onError;
2699 unicode = PyCodec_Decode(buffer, encoding, errors);
2700 if (unicode == NULL)
2701 goto onError;
2702 if (!PyUnicode_Check(unicode)) {
2703 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002704 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002705 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002706 Py_DECREF(unicode);
2707 goto onError;
2708 }
2709 Py_DECREF(buffer);
Victor Stinner17efeed2011-10-04 20:05:46 +02002710#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02002711 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002712 Py_DECREF(unicode);
2713 return NULL;
2714 }
Victor Stinner17efeed2011-10-04 20:05:46 +02002715#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002716 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002717 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002718
Benjamin Peterson29060642009-01-31 22:14:21 +00002719 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002720 Py_XDECREF(buffer);
2721 return NULL;
2722}
2723
Alexander Belopolsky40018472011-02-26 01:02:56 +00002724PyObject *
2725PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002726 const char *encoding,
2727 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002728{
2729 PyObject *v;
2730
2731 if (!PyUnicode_Check(unicode)) {
2732 PyErr_BadArgument();
2733 goto onError;
2734 }
2735
2736 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002737 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002738
2739 /* Decode via the codec registry */
2740 v = PyCodec_Decode(unicode, encoding, errors);
2741 if (v == NULL)
2742 goto onError;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002743 assert(_PyUnicode_CheckConsistency(v, 1));
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002744 return v;
2745
Benjamin Peterson29060642009-01-31 22:14:21 +00002746 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002747 return NULL;
2748}
2749
Alexander Belopolsky40018472011-02-26 01:02:56 +00002750PyObject *
2751PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002752 const char *encoding,
2753 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002754{
2755 PyObject *v;
2756
2757 if (!PyUnicode_Check(unicode)) {
2758 PyErr_BadArgument();
2759 goto onError;
2760 }
2761
2762 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002763 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002764
2765 /* Decode via the codec registry */
2766 v = PyCodec_Decode(unicode, encoding, errors);
2767 if (v == NULL)
2768 goto onError;
2769 if (!PyUnicode_Check(v)) {
2770 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002771 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002772 Py_TYPE(v)->tp_name);
2773 Py_DECREF(v);
2774 goto onError;
2775 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002776 assert(_PyUnicode_CheckConsistency(v, 1));
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002777 return v;
2778
Benjamin Peterson29060642009-01-31 22:14:21 +00002779 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002780 return NULL;
2781}
2782
Alexander Belopolsky40018472011-02-26 01:02:56 +00002783PyObject *
2784PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002785 Py_ssize_t size,
2786 const char *encoding,
2787 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002788{
2789 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00002790
Guido van Rossumd57fd912000-03-10 22:53:23 +00002791 unicode = PyUnicode_FromUnicode(s, size);
2792 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002793 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002794 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2795 Py_DECREF(unicode);
2796 return v;
2797}
2798
Alexander Belopolsky40018472011-02-26 01:02:56 +00002799PyObject *
2800PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002801 const char *encoding,
2802 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002803{
2804 PyObject *v;
2805
2806 if (!PyUnicode_Check(unicode)) {
2807 PyErr_BadArgument();
2808 goto onError;
2809 }
2810
2811 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002812 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002813
2814 /* Encode via the codec registry */
2815 v = PyCodec_Encode(unicode, encoding, errors);
2816 if (v == NULL)
2817 goto onError;
2818 return v;
2819
Benjamin Peterson29060642009-01-31 22:14:21 +00002820 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002821 return NULL;
2822}
2823
Victor Stinnerad158722010-10-27 00:25:46 +00002824PyObject *
2825PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00002826{
Victor Stinner99b95382011-07-04 14:23:54 +02002827#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00002828 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2829 PyUnicode_GET_SIZE(unicode),
2830 NULL);
2831#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002832 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00002833#else
Victor Stinner793b5312011-04-27 00:24:21 +02002834 PyInterpreterState *interp = PyThreadState_GET()->interp;
2835 /* Bootstrap check: if the filesystem codec is implemented in Python, we
2836 cannot use it to encode and decode filenames before it is loaded. Load
2837 the Python codec requires to encode at least its own filename. Use the C
2838 version of the locale codec until the codec registry is initialized and
2839 the Python codec is loaded.
2840
2841 Py_FileSystemDefaultEncoding is shared between all interpreters, we
2842 cannot only rely on it: check also interp->fscodec_initialized for
2843 subinterpreters. */
2844 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00002845 return PyUnicode_AsEncodedString(unicode,
2846 Py_FileSystemDefaultEncoding,
2847 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00002848 }
2849 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002850 /* locale encoding with surrogateescape */
2851 wchar_t *wchar;
2852 char *bytes;
2853 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00002854 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002855
2856 wchar = PyUnicode_AsWideCharString(unicode, NULL);
2857 if (wchar == NULL)
2858 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002859 bytes = _Py_wchar2char(wchar, &error_pos);
2860 if (bytes == NULL) {
2861 if (error_pos != (size_t)-1) {
2862 char *errmsg = strerror(errno);
2863 PyObject *exc = NULL;
2864 if (errmsg == NULL)
2865 errmsg = "Py_wchar2char() failed";
2866 raise_encode_exception(&exc,
2867 "filesystemencoding",
2868 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2869 error_pos, error_pos+1,
2870 errmsg);
2871 Py_XDECREF(exc);
2872 }
2873 else
2874 PyErr_NoMemory();
2875 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002876 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00002877 }
2878 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00002879
2880 bytes_obj = PyBytes_FromString(bytes);
2881 PyMem_Free(bytes);
2882 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00002883 }
Victor Stinnerad158722010-10-27 00:25:46 +00002884#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00002885}
2886
Alexander Belopolsky40018472011-02-26 01:02:56 +00002887PyObject *
2888PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002889 const char *encoding,
2890 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002891{
2892 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00002893 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00002894
Guido van Rossumd57fd912000-03-10 22:53:23 +00002895 if (!PyUnicode_Check(unicode)) {
2896 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002897 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002898 }
Fred Drakee4315f52000-05-09 19:53:39 +00002899
Victor Stinner2f283c22011-03-02 01:21:46 +00002900 if (encoding == NULL) {
2901 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002902 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002903 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002904 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner2f283c22011-03-02 01:21:46 +00002905 }
Fred Drakee4315f52000-05-09 19:53:39 +00002906
2907 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002908 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002909 if ((strcmp(lower, "utf-8") == 0) ||
2910 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00002911 {
Victor Stinner2f283c22011-03-02 01:21:46 +00002912 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002913 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00002914 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002915 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00002916 }
Victor Stinner37296e82010-06-10 13:36:23 +00002917 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002918 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002919 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002920 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002921#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002922 else if (strcmp(lower, "mbcs") == 0)
2923 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2924 PyUnicode_GET_SIZE(unicode),
2925 errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002926#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002927 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002928 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00002929 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002930
2931 /* Encode via the codec registry */
2932 v = PyCodec_Encode(unicode, encoding, errors);
2933 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002934 return NULL;
2935
2936 /* The normal path */
2937 if (PyBytes_Check(v))
2938 return v;
2939
2940 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002941 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002942 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002943 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002944
2945 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2946 "encoder %s returned bytearray instead of bytes",
2947 encoding);
2948 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002949 Py_DECREF(v);
2950 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002951 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002952
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00002953 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2954 Py_DECREF(v);
2955 return b;
2956 }
2957
2958 PyErr_Format(PyExc_TypeError,
2959 "encoder did not return a bytes object (type=%.400s)",
2960 Py_TYPE(v)->tp_name);
2961 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002962 return NULL;
2963}
2964
Alexander Belopolsky40018472011-02-26 01:02:56 +00002965PyObject *
2966PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002967 const char *encoding,
2968 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002969{
2970 PyObject *v;
2971
2972 if (!PyUnicode_Check(unicode)) {
2973 PyErr_BadArgument();
2974 goto onError;
2975 }
2976
2977 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002978 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002979
2980 /* Encode via the codec registry */
2981 v = PyCodec_Encode(unicode, encoding, errors);
2982 if (v == NULL)
2983 goto onError;
2984 if (!PyUnicode_Check(v)) {
2985 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002986 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00002987 Py_TYPE(v)->tp_name);
2988 Py_DECREF(v);
2989 goto onError;
2990 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002991 return v;
Tim Petersced69f82003-09-16 20:30:58 +00002992
Benjamin Peterson29060642009-01-31 22:14:21 +00002993 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002994 return NULL;
2995}
2996
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002997PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00002998PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00002999 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003000 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3001}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003002
Christian Heimes5894ba72007-11-04 11:43:14 +00003003PyObject*
3004PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3005{
Victor Stinner99b95382011-07-04 14:23:54 +02003006#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003007 return PyUnicode_DecodeMBCS(s, size, NULL);
3008#elif defined(__APPLE__)
3009 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
3010#else
Victor Stinner793b5312011-04-27 00:24:21 +02003011 PyInterpreterState *interp = PyThreadState_GET()->interp;
3012 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3013 cannot use it to encode and decode filenames before it is loaded. Load
3014 the Python codec requires to encode at least its own filename. Use the C
3015 version of the locale codec until the codec registry is initialized and
3016 the Python codec is loaded.
3017
3018 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3019 cannot only rely on it: check also interp->fscodec_initialized for
3020 subinterpreters. */
3021 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003022 return PyUnicode_Decode(s, size,
3023 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003024 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003025 }
3026 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003027 /* locale encoding with surrogateescape */
3028 wchar_t *wchar;
3029 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00003030 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003031
3032 if (s[size] != '\0' || size != strlen(s)) {
3033 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3034 return NULL;
3035 }
3036
Victor Stinner168e1172010-10-16 23:16:16 +00003037 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003038 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00003039 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003040
Victor Stinner168e1172010-10-16 23:16:16 +00003041 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003042 PyMem_Free(wchar);
3043 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003044 }
Victor Stinnerad158722010-10-27 00:25:46 +00003045#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003046}
3047
Martin v. Löwis011e8422009-05-05 04:43:17 +00003048
3049int
3050PyUnicode_FSConverter(PyObject* arg, void* addr)
3051{
3052 PyObject *output = NULL;
3053 Py_ssize_t size;
3054 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003055 if (arg == NULL) {
3056 Py_DECREF(*(PyObject**)addr);
3057 return 1;
3058 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003059 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003060 output = arg;
3061 Py_INCREF(output);
3062 }
3063 else {
3064 arg = PyUnicode_FromObject(arg);
3065 if (!arg)
3066 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003067 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003068 Py_DECREF(arg);
3069 if (!output)
3070 return 0;
3071 if (!PyBytes_Check(output)) {
3072 Py_DECREF(output);
3073 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3074 return 0;
3075 }
3076 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003077 size = PyBytes_GET_SIZE(output);
3078 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003079 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003080 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003081 Py_DECREF(output);
3082 return 0;
3083 }
3084 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003085 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003086}
3087
3088
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003089int
3090PyUnicode_FSDecoder(PyObject* arg, void* addr)
3091{
3092 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003093 if (arg == NULL) {
3094 Py_DECREF(*(PyObject**)addr);
3095 return 1;
3096 }
3097 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003098 if (PyUnicode_READY(arg))
3099 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003100 output = arg;
3101 Py_INCREF(output);
3102 }
3103 else {
3104 arg = PyBytes_FromObject(arg);
3105 if (!arg)
3106 return 0;
3107 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3108 PyBytes_GET_SIZE(arg));
3109 Py_DECREF(arg);
3110 if (!output)
3111 return 0;
3112 if (!PyUnicode_Check(output)) {
3113 Py_DECREF(output);
3114 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3115 return 0;
3116 }
3117 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003118 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3119 PyUnicode_GET_LENGTH(output), 0, 1)) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003120 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3121 Py_DECREF(output);
3122 return 0;
3123 }
3124 *(PyObject**)addr = output;
3125 return Py_CLEANUP_SUPPORTED;
3126}
3127
3128
Martin v. Löwis5b222132007-06-10 09:51:05 +00003129char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003130PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003131{
Christian Heimesf3863112007-11-22 07:46:41 +00003132 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003133 PyUnicodeObject *u = (PyUnicodeObject *)unicode;
3134
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003135 if (!PyUnicode_Check(unicode)) {
3136 PyErr_BadArgument();
3137 return NULL;
3138 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003139 if (PyUnicode_READY(u) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003140 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003141
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003142 if (PyUnicode_UTF8(unicode) == NULL) {
3143 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003144 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3145 if (bytes == NULL)
3146 return NULL;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003147 _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3148 if (_PyUnicode_UTF8(u) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003149 Py_DECREF(bytes);
3150 return NULL;
3151 }
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003152 _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
3153 Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003154 Py_DECREF(bytes);
3155 }
3156
3157 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003158 *psize = PyUnicode_UTF8_LENGTH(unicode);
3159 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003160}
3161
3162char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003163PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003164{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003165 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3166}
3167
3168#ifdef Py_DEBUG
3169int unicode_as_unicode_calls = 0;
3170#endif
3171
3172
3173Py_UNICODE *
3174PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3175{
3176 PyUnicodeObject *u;
3177 const unsigned char *one_byte;
3178#if SIZEOF_WCHAR_T == 4
3179 const Py_UCS2 *two_bytes;
3180#else
3181 const Py_UCS4 *four_bytes;
3182 const Py_UCS4 *ucs4_end;
3183 Py_ssize_t num_surrogates;
3184#endif
3185 wchar_t *w;
3186 wchar_t *wchar_end;
3187
3188 if (!PyUnicode_Check(unicode)) {
3189 PyErr_BadArgument();
3190 return NULL;
3191 }
3192 u = (PyUnicodeObject*)unicode;
3193 if (_PyUnicode_WSTR(u) == NULL) {
3194 /* Non-ASCII compact unicode object */
3195 assert(_PyUnicode_KIND(u) != 0);
3196 assert(PyUnicode_IS_READY(u));
3197
3198#ifdef Py_DEBUG
3199 ++unicode_as_unicode_calls;
3200#endif
3201
3202 if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
3203#if SIZEOF_WCHAR_T == 2
3204 four_bytes = PyUnicode_4BYTE_DATA(u);
3205 ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
3206 num_surrogates = 0;
3207
3208 for (; four_bytes < ucs4_end; ++four_bytes) {
3209 if (*four_bytes > 0xFFFF)
3210 ++num_surrogates;
3211 }
3212
3213 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
3214 sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
3215 if (!_PyUnicode_WSTR(u)) {
3216 PyErr_NoMemory();
3217 return NULL;
3218 }
3219 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
3220
3221 w = _PyUnicode_WSTR(u);
3222 wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
3223 four_bytes = PyUnicode_4BYTE_DATA(u);
3224 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3225 if (*four_bytes > 0xFFFF) {
3226 /* encode surrogate pair in this case */
3227 *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
3228 *w = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
3229 }
3230 else
3231 *w = *four_bytes;
3232
3233 if (w > wchar_end) {
3234 assert(0 && "Miscalculated string end");
3235 }
3236 }
3237 *w = 0;
3238#else
3239 /* sizeof(wchar_t) == 4 */
3240 Py_FatalError("Impossible unicode object state, wstr and str "
3241 "should share memory already.");
3242 return NULL;
3243#endif
3244 }
3245 else {
3246 _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3247 (_PyUnicode_LENGTH(u) + 1));
3248 if (!_PyUnicode_WSTR(u)) {
3249 PyErr_NoMemory();
3250 return NULL;
3251 }
3252 if (!PyUnicode_IS_COMPACT_ASCII(u))
3253 _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
3254 w = _PyUnicode_WSTR(u);
3255 wchar_end = w + _PyUnicode_LENGTH(u);
3256
3257 if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
3258 one_byte = PyUnicode_1BYTE_DATA(u);
3259 for (; w < wchar_end; ++one_byte, ++w)
3260 *w = *one_byte;
3261 /* null-terminate the wstr */
3262 *w = 0;
3263 }
3264 else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
3265#if SIZEOF_WCHAR_T == 4
3266 two_bytes = PyUnicode_2BYTE_DATA(u);
3267 for (; w < wchar_end; ++two_bytes, ++w)
3268 *w = *two_bytes;
3269 /* null-terminate the wstr */
3270 *w = 0;
3271#else
3272 /* sizeof(wchar_t) == 2 */
3273 PyObject_FREE(_PyUnicode_WSTR(u));
3274 _PyUnicode_WSTR(u) = NULL;
3275 Py_FatalError("Impossible unicode object state, wstr "
3276 "and str should share memory already.");
3277 return NULL;
3278#endif
3279 }
3280 else {
3281 assert(0 && "This should never happen.");
3282 }
3283 }
3284 }
3285 if (size != NULL)
3286 *size = PyUnicode_WSTR_LENGTH(u);
3287 return _PyUnicode_WSTR(u);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003288}
3289
Alexander Belopolsky40018472011-02-26 01:02:56 +00003290Py_UNICODE *
3291PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003292{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003293 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003294}
3295
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003296
Alexander Belopolsky40018472011-02-26 01:02:56 +00003297Py_ssize_t
3298PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003299{
3300 if (!PyUnicode_Check(unicode)) {
3301 PyErr_BadArgument();
3302 goto onError;
3303 }
3304 return PyUnicode_GET_SIZE(unicode);
3305
Benjamin Peterson29060642009-01-31 22:14:21 +00003306 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003307 return -1;
3308}
3309
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003310Py_ssize_t
3311PyUnicode_GetLength(PyObject *unicode)
3312{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003313 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003314 PyErr_BadArgument();
3315 return -1;
3316 }
3317
3318 return PyUnicode_GET_LENGTH(unicode);
3319}
3320
3321Py_UCS4
3322PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3323{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003324 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3325 PyErr_BadArgument();
3326 return (Py_UCS4)-1;
3327 }
3328 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3329 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003330 return (Py_UCS4)-1;
3331 }
3332 return PyUnicode_READ_CHAR(unicode, index);
3333}
3334
3335int
3336PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3337{
3338 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003339 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003340 return -1;
3341 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003342 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3343 PyErr_SetString(PyExc_IndexError, "string index out of range");
3344 return -1;
3345 }
3346 if (_PyUnicode_Dirty(unicode))
3347 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003348 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3349 index, ch);
3350 return 0;
3351}
3352
/* Return the name of the default encoding, which is always "utf-8".
   The returned string is static storage and must not be freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3358
Victor Stinner554f3f02010-06-16 23:33:54 +00003359/* create or adjust a UnicodeDecodeError */
3360static void
3361make_decode_exception(PyObject **exceptionObject,
3362 const char *encoding,
3363 const char *input, Py_ssize_t length,
3364 Py_ssize_t startpos, Py_ssize_t endpos,
3365 const char *reason)
3366{
3367 if (*exceptionObject == NULL) {
3368 *exceptionObject = PyUnicodeDecodeError_Create(
3369 encoding, input, length, startpos, endpos, reason);
3370 }
3371 else {
3372 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3373 goto onError;
3374 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3375 goto onError;
3376 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3377 goto onError;
3378 }
3379 return;
3380
3381onError:
3382 Py_DECREF(*exceptionObject);
3383 *exceptionObject = NULL;
3384}
3385
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003386/* error handling callback helper:
3387 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003388 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003389 and adjust various state variables.
3390 return 0 on success, -1 on error
3391*/
3392
Alexander Belopolsky40018472011-02-26 01:02:56 +00003393static int
3394unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003395 const char *encoding, const char *reason,
3396 const char **input, const char **inend, Py_ssize_t *startinpos,
3397 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3398 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003399{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003400 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003401
3402 PyObject *restuple = NULL;
3403 PyObject *repunicode = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003404 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003405 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003406 Py_ssize_t requiredsize;
3407 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003408 const Py_UNICODE *repptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003409 PyObject *inputobj = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003410 Py_ssize_t repsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003411 int res = -1;
3412
3413 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003414 *errorHandler = PyCodec_LookupError(errors);
3415 if (*errorHandler == NULL)
3416 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003417 }
3418
Victor Stinner554f3f02010-06-16 23:33:54 +00003419 make_decode_exception(exceptionObject,
3420 encoding,
3421 *input, *inend - *input,
3422 *startinpos, *endinpos,
3423 reason);
3424 if (*exceptionObject == NULL)
3425 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003426
3427 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3428 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003429 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003430 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003431 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003432 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003433 }
3434 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003435 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003436
3437 /* Copy back the bytes variables, which might have been modified by the
3438 callback */
3439 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3440 if (!inputobj)
3441 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003442 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003443 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003444 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003445 *input = PyBytes_AS_STRING(inputobj);
3446 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003447 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003448 /* we can DECREF safely, as the exception has another reference,
3449 so the object won't go away. */
3450 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003451
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003452 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003453 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003454 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003455 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3456 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003457 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003458
3459 /* need more space? (at least enough for what we
3460 have+the replacement+the rest of the string (starting
3461 at the new input position), so we won't have to check space
3462 when there are no errors in the rest of the string) */
3463 repptr = PyUnicode_AS_UNICODE(repunicode);
3464 repsize = PyUnicode_GET_SIZE(repunicode);
3465 requiredsize = *outpos + repsize + insize-newpos;
3466 if (requiredsize > outsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003467 if (requiredsize<2*outsize)
3468 requiredsize = 2*outsize;
Victor Stinnerfe226c02011-10-03 03:52:20 +02003469 if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003470 goto onError;
3471 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003472 }
3473 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003474 *inptr = *input + newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003475 Py_UNICODE_COPY(*outptr, repptr, repsize);
3476 *outptr += repsize;
3477 *outpos += repsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003478
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003479 /* we made it! */
3480 res = 0;
3481
Benjamin Peterson29060642009-01-31 22:14:21 +00003482 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003483 Py_XDECREF(restuple);
3484 return res;
3485}
3486
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003487/* --- UTF-7 Codec -------------------------------------------------------- */
3488
Antoine Pitrou244651a2009-05-04 18:56:13 +00003489/* See RFC2152 for details. We encode conservatively and decode liberally. */
3490
/* Three simple macros defining base-64. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.  */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003567
Alexander Belopolsky40018472011-02-26 01:02:56 +00003568PyObject *
3569PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003570 Py_ssize_t size,
3571 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003572{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003573 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3574}
3575
Antoine Pitrou244651a2009-05-04 18:56:13 +00003576/* The decoder. The only state we preserve is our read position,
3577 * i.e. how many characters we have consumed. So if we end in the
3578 * middle of a shift sequence we have to back off the read position
3579 * and the output to the beginning of the sequence, otherwise we lose
3580 * all the shift state (seen bits, number of bits seen, high
3581 * surrogate). */
3582
Alexander Belopolsky40018472011-02-26 01:02:56 +00003583PyObject *
3584PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003585 Py_ssize_t size,
3586 const char *errors,
3587 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003588{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003589 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003590 Py_ssize_t startinpos;
3591 Py_ssize_t endinpos;
3592 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003593 const char *e;
3594 PyUnicodeObject *unicode;
3595 Py_UNICODE *p;
3596 const char *errmsg = "";
3597 int inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003598 Py_UNICODE *shiftOutStart;
3599 unsigned int base64bits = 0;
3600 unsigned long base64buffer = 0;
3601 Py_UNICODE surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003602 PyObject *errorHandler = NULL;
3603 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003604
3605 unicode = _PyUnicode_New(size);
3606 if (!unicode)
3607 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003608 if (size == 0) {
3609 if (consumed)
3610 *consumed = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003611 return (PyObject *)unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003612 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003613
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003614 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitrou244651a2009-05-04 18:56:13 +00003615 shiftOutStart = p;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003616 e = s + size;
3617
3618 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003619 Py_UNICODE ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003620 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003621 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003622
Antoine Pitrou244651a2009-05-04 18:56:13 +00003623 if (inShift) { /* in a base-64 section */
3624 if (IS_BASE64(ch)) { /* consume a base-64 character */
3625 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3626 base64bits += 6;
3627 s++;
3628 if (base64bits >= 16) {
3629 /* we have enough bits for a UTF-16 value */
3630 Py_UNICODE outCh = (Py_UNICODE)
3631 (base64buffer >> (base64bits-16));
3632 base64bits -= 16;
3633 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3634 if (surrogate) {
3635 /* expecting a second surrogate */
3636 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3637#ifdef Py_UNICODE_WIDE
3638 *p++ = (((surrogate & 0x3FF)<<10)
3639 | (outCh & 0x3FF)) + 0x10000;
3640#else
3641 *p++ = surrogate;
3642 *p++ = outCh;
3643#endif
3644 surrogate = 0;
3645 }
3646 else {
3647 surrogate = 0;
3648 errmsg = "second surrogate missing";
3649 goto utf7Error;
3650 }
3651 }
3652 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3653 /* first surrogate */
3654 surrogate = outCh;
3655 }
3656 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3657 errmsg = "unexpected second surrogate";
3658 goto utf7Error;
3659 }
3660 else {
3661 *p++ = outCh;
3662 }
3663 }
3664 }
3665 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003666 inShift = 0;
3667 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003668 if (surrogate) {
3669 errmsg = "second surrogate missing at end of shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +00003670 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003671 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003672 if (base64bits > 0) { /* left-over bits */
3673 if (base64bits >= 6) {
3674 /* We've seen at least one base-64 character */
3675 errmsg = "partial character in shift sequence";
3676 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003677 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003678 else {
3679 /* Some bits remain; they should be zero */
3680 if (base64buffer != 0) {
3681 errmsg = "non-zero padding bits in shift sequence";
3682 goto utf7Error;
3683 }
3684 }
3685 }
3686 if (ch != '-') {
3687 /* '-' is absorbed; other terminating
3688 characters are preserved */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003689 *p++ = ch;
3690 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003691 }
3692 }
3693 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003694 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003695 s++; /* consume '+' */
3696 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003697 s++;
3698 *p++ = '+';
Antoine Pitrou244651a2009-05-04 18:56:13 +00003699 }
3700 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003701 inShift = 1;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003702 shiftOutStart = p;
3703 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003704 }
3705 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003706 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003707 *p++ = ch;
3708 s++;
3709 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003710 else {
3711 startinpos = s-starts;
3712 s++;
3713 errmsg = "unexpected special character";
3714 goto utf7Error;
3715 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003716 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003717utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003718 outpos = p-PyUnicode_AS_UNICODE(unicode);
3719 endinpos = s-starts;
3720 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00003721 errors, &errorHandler,
3722 "utf7", errmsg,
3723 &starts, &e, &startinpos, &endinpos, &exc, &s,
3724 &unicode, &outpos, &p))
3725 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003726 }
3727
Antoine Pitrou244651a2009-05-04 18:56:13 +00003728 /* end of string */
3729
3730 if (inShift && !consumed) { /* in shift sequence, no more to follow */
3731 /* if we're in an inconsistent state, that's an error */
3732 if (surrogate ||
3733 (base64bits >= 6) ||
3734 (base64bits > 0 && base64buffer != 0)) {
3735 outpos = p-PyUnicode_AS_UNICODE(unicode);
3736 endinpos = size;
3737 if (unicode_decode_call_errorhandler(
3738 errors, &errorHandler,
3739 "utf7", "unterminated shift sequence",
3740 &starts, &e, &startinpos, &endinpos, &exc, &s,
3741 &unicode, &outpos, &p))
3742 goto onError;
3743 if (s < e)
3744 goto restart;
3745 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003746 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003747
3748 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003749 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003750 if (inShift) {
3751 p = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003752 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003753 }
3754 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003755 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003756 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003757 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003758
Victor Stinnerfe226c02011-10-03 03:52:20 +02003759 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003760 goto onError;
3761
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003762 Py_XDECREF(errorHandler);
3763 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02003764#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02003765 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003766 Py_DECREF(unicode);
3767 return NULL;
3768 }
Victor Stinner17efeed2011-10-04 20:05:46 +02003769#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02003770 assert(_PyUnicode_CheckConsistency(unicode, 1));
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003771 return (PyObject *)unicode;
3772
Benjamin Peterson29060642009-01-31 22:14:21 +00003773 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003774 Py_XDECREF(errorHandler);
3775 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003776 Py_DECREF(unicode);
3777 return NULL;
3778}
3779
3780
Alexander Belopolsky40018472011-02-26 01:02:56 +00003781PyObject *
3782PyUnicode_EncodeUTF7(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003783 Py_ssize_t size,
3784 int base64SetO,
3785 int base64WhiteSpace,
3786 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003787{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003788 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003789 /* It might be possible to tighten this worst case */
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003790 Py_ssize_t allocated = 8 * size;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003791 int inShift = 0;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003792 Py_ssize_t i = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003793 unsigned int base64bits = 0;
3794 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003795 char * out;
3796 char * start;
3797
3798 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003799 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003800
Alexandre Vassalottie85bd982009-07-21 00:39:03 +00003801 if (allocated / 8 != size)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00003802 return PyErr_NoMemory();
3803
Antoine Pitrou244651a2009-05-04 18:56:13 +00003804 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003805 if (v == NULL)
3806 return NULL;
3807
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003808 start = out = PyBytes_AS_STRING(v);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003809 for (;i < size; ++i) {
3810 Py_UNICODE ch = s[i];
3811
Antoine Pitrou244651a2009-05-04 18:56:13 +00003812 if (inShift) {
3813 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3814 /* shifting out */
3815 if (base64bits) { /* output remaining bits */
3816 *out++ = TO_BASE64(base64buffer << (6-base64bits));
3817 base64buffer = 0;
3818 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003819 }
3820 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003821 /* Characters not in the BASE64 set implicitly unshift the sequence
3822 so no '-' is required, except if the character is itself a '-' */
3823 if (IS_BASE64(ch) || ch == '-') {
3824 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003825 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003826 *out++ = (char) ch;
3827 }
3828 else {
3829 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00003830 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003831 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003832 else { /* not in a shift sequence */
3833 if (ch == '+') {
3834 *out++ = '+';
3835 *out++ = '-';
3836 }
3837 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3838 *out++ = (char) ch;
3839 }
3840 else {
3841 *out++ = '+';
3842 inShift = 1;
3843 goto encode_char;
3844 }
3845 }
3846 continue;
3847encode_char:
3848#ifdef Py_UNICODE_WIDE
3849 if (ch >= 0x10000) {
3850 /* code first surrogate */
3851 base64bits += 16;
3852 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3853 while (base64bits >= 6) {
3854 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3855 base64bits -= 6;
3856 }
3857 /* prepare second surrogate */
3858 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
3859 }
3860#endif
3861 base64bits += 16;
3862 base64buffer = (base64buffer << 16) | ch;
3863 while (base64bits >= 6) {
3864 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3865 base64bits -= 6;
3866 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00003867 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003868 if (base64bits)
3869 *out++= TO_BASE64(base64buffer << (6-base64bits) );
3870 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003871 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00003872 if (_PyBytes_Resize(&v, out - start) < 0)
3873 return NULL;
3874 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003875}
3876
Antoine Pitrou244651a2009-05-04 18:56:13 +00003877#undef IS_BASE64
3878#undef FROM_BASE64
3879#undef TO_BASE64
3880#undef DECODE_DIRECT
3881#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003882
Guido van Rossumd57fd912000-03-10 22:53:23 +00003883/* --- UTF-8 Codec -------------------------------------------------------- */
3884
/* Lookup table indexed by the first byte of a UTF-8 sequence; the entry is
   the total length in bytes of the sequence that byte introduces.  An
   entry of zero marks a byte that cannot start a sequence.  See RFC 3629
   for details. */
static char utf8_code_length[256] = {
    /* 00-0F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 10-1F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 20-2F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 30-3F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 40-4F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 50-5F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 60-6F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 70-7F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 80-8F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 90-9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* A0-AF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* B0-BF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* C0-C1 are illegal, C2-CF start a 2-byte sequence */
    /* C0-CF */ 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* D0-DF */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* E0-EF */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* F0-F4 start a 4-byte sequence, F5-FF are illegal */
    /* F0-FF */ 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
3906
Alexander Belopolsky40018472011-02-26 01:02:56 +00003907PyObject *
3908PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003909 Py_ssize_t size,
3910 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003911{
Walter Dörwald69652032004-09-07 20:24:22 +00003912 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3913}
3914
Antoine Pitrouab868312009-01-10 15:40:25 +00003915/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3916#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3917
3918/* Mask to quickly check whether a C 'long' contains a
3919 non-ASCII, UTF8-encoded char. */
3920#if (SIZEOF_LONG == 8)
3921# define ASCII_CHAR_MASK 0x8080808080808080L
3922#elif (SIZEOF_LONG == 4)
3923# define ASCII_CHAR_MASK 0x80808080L
3924#else
3925# error C 'long' size should be either 4 or 8!
3926#endif
3927
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003928/* Scans a UTF-8 string and returns the maximum character to be expected,
3929 the size of the decoded unicode string and if any major errors were
3930 encountered.
3931
3932 This function does check basic UTF-8 sanity, it does however NOT CHECK
3933 if the string contains surrogates, and if all continuation bytes are
3934 within the correct ranges, these checks are performed in
3935 PyUnicode_DecodeUTF8Stateful.
3936
3937 If it sets has_errors to 1, it means the value of unicode_size and max_char
3938 will be bogus and you should not rely on useful information in them.
3939 */
3940static Py_UCS4
3941utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3942 Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3943 int *has_errors)
3944{
3945 Py_ssize_t n;
3946 Py_ssize_t char_count = 0;
3947 Py_UCS4 max_char = 127, new_max;
3948 Py_UCS4 upper_bound;
3949 const unsigned char *p = (const unsigned char *)s;
3950 const unsigned char *end = p + string_size;
3951 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
3952 int err = 0;
3953
3954 for (; p < end && !err; ++p, ++char_count) {
3955 /* Only check value if it's not a ASCII char... */
3956 if (*p < 0x80) {
3957 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
3958 an explanation. */
3959 if (!((size_t) p & LONG_PTR_MASK)) {
3960 /* Help register allocation */
3961 register const unsigned char *_p = p;
3962 while (_p < aligned_end) {
3963 unsigned long value = *(unsigned long *) _p;
3964 if (value & ASCII_CHAR_MASK)
3965 break;
3966 _p += SIZEOF_LONG;
3967 char_count += SIZEOF_LONG;
3968 }
3969 p = _p;
3970 if (p == end)
3971 break;
3972 }
3973 }
3974 if (*p >= 0x80) {
3975 n = utf8_code_length[*p];
3976 new_max = max_char;
3977 switch (n) {
3978 /* invalid start byte */
3979 case 0:
3980 err = 1;
3981 break;
3982 case 2:
3983 /* Code points between 0x00FF and 0x07FF inclusive.
3984 Approximate the upper bound of the code point,
3985 if this flips over 255 we can be sure it will be more
3986 than 255 and the string will need 2 bytes per code coint,
3987 if it stays under or equal to 255, we can be sure 1 byte
3988 is enough.
3989 ((*p & 0b00011111) << 6) | 0b00111111 */
3990 upper_bound = ((*p & 0x1F) << 6) | 0x3F;
3991 if (max_char < upper_bound)
3992 new_max = upper_bound;
3993 /* Ensure we track at least that we left ASCII space. */
3994 if (new_max < 128)
3995 new_max = 128;
3996 break;
3997 case 3:
3998 /* Between 0x0FFF and 0xFFFF inclusive, so values are
3999 always > 255 and <= 65535 and will always need 2 bytes. */
4000 if (max_char < 65535)
4001 new_max = 65535;
4002 break;
4003 case 4:
4004 /* Code point will be above 0xFFFF for sure in this case. */
4005 new_max = 65537;
4006 break;
4007 /* Internal error, this should be caught by the first if */
4008 case 1:
4009 default:
4010 assert(0 && "Impossible case in utf8_max_char_and_size");
4011 err = 1;
4012 }
4013 /* Instead of number of overall bytes for this code point,
Georg Brandl7597add2011-10-05 16:36:47 +02004014 n contains the number of following bytes: */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004015 --n;
4016 /* Check if the follow up chars are all valid continuation bytes */
4017 if (n >= 1) {
4018 const unsigned char *cont;
4019 if ((p + n) >= end) {
4020 if (consumed == 0)
4021 /* incomplete data, non-incremental decoding */
4022 err = 1;
4023 break;
4024 }
4025 for (cont = p + 1; cont < (p + n); ++cont) {
4026 if ((*cont & 0xc0) != 0x80) {
4027 err = 1;
4028 break;
4029 }
4030 }
4031 p += n;
4032 }
4033 else
4034 err = 1;
4035 max_char = new_max;
4036 }
4037 }
4038
4039 if (unicode_size)
4040 *unicode_size = char_count;
4041 if (has_errors)
4042 *has_errors = err;
4043 return max_char;
4044}
4045
/* Store `value` at position `index` of `buf`, casting the buffer and the
   value to the element type selected by `kind`.  Works like PyUnicode_WRITE
   but additionally accepts PyUnicode_WCHAR_KIND, i.e. the Py_UNICODE (wstr)
   buffer of the legacy unicode representation.  `kind` is evaluated once;
   `index` and `value` are evaluated once, in the branch that is taken. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value)                     \
    do {                                                                    \
        const int wkind_ = (kind);                                          \
        if (wkind_ == PyUnicode_WCHAR_KIND) {                               \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value);           \
        }                                                                   \
        else if (wkind_ == PyUnicode_1BYTE_KIND) {                          \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value);     \
        }                                                                   \
        else if (wkind_ == PyUnicode_2BYTE_KIND) {                          \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value);                 \
        }                                                                   \
        else {                                                              \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value);                 \
        }                                                                   \
    } while (0)
4060
Alexander Belopolsky40018472011-02-26 01:02:56 +00004061PyObject *
4062PyUnicode_DecodeUTF8Stateful(const char *s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004063 Py_ssize_t size,
4064 const char *errors,
4065 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004066{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004067 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004068 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004069 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004070 Py_ssize_t startinpos;
4071 Py_ssize_t endinpos;
Antoine Pitrouab868312009-01-10 15:40:25 +00004072 const char *e, *aligned_end;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004073 PyUnicodeObject *unicode;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004074 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004075 PyObject *errorHandler = NULL;
4076 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004077 Py_UCS4 maxchar = 0;
4078 Py_ssize_t unicode_size;
4079 Py_ssize_t i;
4080 int kind;
4081 void *data;
4082 int has_errors;
4083 Py_UNICODE *error_outptr;
4084#if SIZEOF_WCHAR_T == 2
4085 Py_ssize_t wchar_offset = 0;
4086#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004087
Walter Dörwald69652032004-09-07 20:24:22 +00004088 if (size == 0) {
4089 if (consumed)
4090 *consumed = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004091 return (PyObject *)PyUnicode_New(0, 0);
Walter Dörwald69652032004-09-07 20:24:22 +00004092 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004093 maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
4094 consumed, &has_errors);
4095 if (has_errors) {
4096 unicode = _PyUnicode_New(size);
4097 if (!unicode)
4098 return NULL;
4099 kind = PyUnicode_WCHAR_KIND;
4100 data = PyUnicode_AS_UNICODE(unicode);
4101 assert(data != NULL);
4102 }
4103 else {
4104 unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
4105 if (!unicode)
4106 return NULL;
4107 /* When the string is ASCII only, just use memcpy and return.
4108 unicode_size may be != size if there is an incomplete UTF-8
4109 sequence at the end of the ASCII block. */
4110 if (maxchar < 128 && size == unicode_size) {
4111 Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
4112 return (PyObject *)unicode;
4113 }
4114 kind = PyUnicode_KIND(unicode);
4115 data = PyUnicode_DATA(unicode);
4116 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004117 /* Unpack UTF-8 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004118 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004119 e = s + size;
Antoine Pitrouab868312009-01-10 15:40:25 +00004120 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004121
4122 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004123 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004124
4125 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004126 /* Fast path for runs of ASCII characters. Given that common UTF-8
4127 input will consist of an overwhelming majority of ASCII
4128 characters, we try to optimize for this case by checking
4129 as many characters as a C 'long' can contain.
4130 First, check if we can do an aligned read, as most CPUs have
4131 a penalty for unaligned reads.
4132 */
4133 if (!((size_t) s & LONG_PTR_MASK)) {
4134 /* Help register allocation */
4135 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004136 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004137 while (_s < aligned_end) {
4138 /* Read a whole long at a time (either 4 or 8 bytes),
4139 and do a fast unrolled copy if it only contains ASCII
4140 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004141 unsigned long value = *(unsigned long *) _s;
4142 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004143 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004144 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
4145 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
4146 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
4147 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004148#if (SIZEOF_LONG == 8)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004149 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
4150 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
4151 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
4152 WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004153#endif
4154 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004155 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004156 }
4157 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004158 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004159 if (s == e)
4160 break;
4161 ch = (unsigned char)*s;
4162 }
4163 }
4164
4165 if (ch < 0x80) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004166 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004167 s++;
4168 continue;
4169 }
4170
4171 n = utf8_code_length[ch];
4172
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004173 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004174 if (consumed)
4175 break;
4176 else {
4177 errmsg = "unexpected end of data";
4178 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004179 endinpos = startinpos+1;
4180 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4181 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004182 goto utf8Error;
4183 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004184 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004185
4186 switch (n) {
4187
4188 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004189 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004190 startinpos = s-starts;
4191 endinpos = startinpos+1;
4192 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004193
4194 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004195 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004196 startinpos = s-starts;
4197 endinpos = startinpos+1;
4198 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004199
4200 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004201 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004202 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004203 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004204 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004205 goto utf8Error;
4206 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004207 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004208 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004209 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004210 break;
4211
4212 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004213 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4214 will result in surrogates in range d800-dfff. Surrogates are
4215 not valid UTF-8 so they are rejected.
4216 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4217 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004218 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004219 (s[2] & 0xc0) != 0x80 ||
4220 ((unsigned char)s[0] == 0xE0 &&
4221 (unsigned char)s[1] < 0xA0) ||
4222 ((unsigned char)s[0] == 0xED &&
4223 (unsigned char)s[1] > 0x9F)) {
4224 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004225 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004226 endinpos = startinpos + 1;
4227
4228 /* if s[1] first two bits are 1 and 0, then the invalid
4229 continuation byte is s[2], so increment endinpos by 1,
4230 if not, s[1] is invalid and endinpos doesn't need to
4231 be incremented. */
4232 if ((s[1] & 0xC0) == 0x80)
4233 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004234 goto utf8Error;
4235 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004236 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004237 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004238 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004239 break;
4240
4241 case 4:
4242 if ((s[1] & 0xc0) != 0x80 ||
4243 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004244 (s[3] & 0xc0) != 0x80 ||
4245 ((unsigned char)s[0] == 0xF0 &&
4246 (unsigned char)s[1] < 0x90) ||
4247 ((unsigned char)s[0] == 0xF4 &&
4248 (unsigned char)s[1] > 0x8F)) {
4249 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004250 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004251 endinpos = startinpos + 1;
4252 if ((s[1] & 0xC0) == 0x80) {
4253 endinpos++;
4254 if ((s[2] & 0xC0) == 0x80)
4255 endinpos++;
4256 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004257 goto utf8Error;
4258 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004259 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004260 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4261 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4262
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004263 /* If the string is flexible or we have native UCS-4, write
4264 directly.. */
4265 if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
4266 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
Tim Petersced69f82003-09-16 20:30:58 +00004267
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004268 else {
4269 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00004270
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004271 /* translate from 10000..10FFFF to 0..FFFF */
4272 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00004273
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004274 /* high surrogate = top 10 bits added to D800 */
4275 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4276 (Py_UNICODE)(0xD800 + (ch >> 10)));
4277
4278 /* low surrogate = bottom 10 bits added to DC00 */
4279 WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4280 (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
4281 }
4282#if SIZEOF_WCHAR_T == 2
4283 wchar_offset++;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00004284#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00004285 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004286 }
4287 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004288 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004289
Benjamin Peterson29060642009-01-31 22:14:21 +00004290 utf8Error:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004291 /* If this is not yet a resizable string, make it one.. */
4292 if (kind != PyUnicode_WCHAR_KIND) {
4293 const Py_UNICODE *u;
4294 PyUnicodeObject *new_unicode = _PyUnicode_New(size);
4295 if (!new_unicode)
4296 goto onError;
4297 u = PyUnicode_AsUnicode((PyObject *)unicode);
4298 if (!u)
4299 goto onError;
4300#if SIZEOF_WCHAR_T == 2
4301 i += wchar_offset;
4302#endif
4303 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
4304 Py_DECREF(unicode);
4305 unicode = new_unicode;
4306 kind = 0;
4307 data = PyUnicode_AS_UNICODE(new_unicode);
4308 assert(data != NULL);
4309 }
4310 error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
Benjamin Peterson29060642009-01-31 22:14:21 +00004311 if (unicode_decode_call_errorhandler(
4312 errors, &errorHandler,
4313 "utf8", errmsg,
4314 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004315 &unicode, &i, &error_outptr))
Benjamin Peterson29060642009-01-31 22:14:21 +00004316 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004317 /* Update data because unicode_decode_call_errorhandler might have
4318 re-created or resized the unicode object. */
4319 data = PyUnicode_AS_UNICODE(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00004320 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004321 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004322 /* Ensure the unicode_size calculation above was correct: */
4323 assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
4324
Walter Dörwald69652032004-09-07 20:24:22 +00004325 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004326 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004327
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004328 /* Adjust length and ready string when it contained errors and
4329 is of the old resizable kind. */
4330 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004331 if (PyUnicode_Resize((PyObject**)&unicode, i) < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004332 goto onError;
4333 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004334
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004335 Py_XDECREF(errorHandler);
4336 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02004337#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004338 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004339 Py_DECREF(unicode);
4340 return NULL;
4341 }
Victor Stinner17efeed2011-10-04 20:05:46 +02004342#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004343 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004344 return (PyObject *)unicode;
4345
Benjamin Peterson29060642009-01-31 22:14:21 +00004346 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004347 Py_XDECREF(errorHandler);
4348 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004349 Py_DECREF(unicode);
4350 return NULL;
4351}
4352
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004353#undef WRITE_FLEXIBLE_OR_WSTR
Antoine Pitrouab868312009-01-10 15:40:25 +00004354
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004355#ifdef __APPLE__
4356
4357/* Simplified UTF-8 decoder using surrogateescape error handler,
4358 used to decode the command line arguments on Mac OS X. */
4359
4360wchar_t*
4361_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4362{
4363 int n;
4364 const char *e;
4365 wchar_t *unicode, *p;
4366
4367 /* Note: size will always be longer than the resulting Unicode
4368 character count */
4369 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4370 PyErr_NoMemory();
4371 return NULL;
4372 }
4373 unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4374 if (!unicode)
4375 return NULL;
4376
4377 /* Unpack UTF-8 encoded data */
4378 p = unicode;
4379 e = s + size;
4380 while (s < e) {
4381 Py_UCS4 ch = (unsigned char)*s;
4382
4383 if (ch < 0x80) {
4384 *p++ = (wchar_t)ch;
4385 s++;
4386 continue;
4387 }
4388
4389 n = utf8_code_length[ch];
4390 if (s + n > e) {
4391 goto surrogateescape;
4392 }
4393
4394 switch (n) {
4395 case 0:
4396 case 1:
4397 goto surrogateescape;
4398
4399 case 2:
4400 if ((s[1] & 0xc0) != 0x80)
4401 goto surrogateescape;
4402 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4403 assert ((ch > 0x007F) && (ch <= 0x07FF));
4404 *p++ = (wchar_t)ch;
4405 break;
4406
4407 case 3:
4408 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4409 will result in surrogates in range d800-dfff. Surrogates are
4410 not valid UTF-8 so they are rejected.
4411 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4412 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4413 if ((s[1] & 0xc0) != 0x80 ||
4414 (s[2] & 0xc0) != 0x80 ||
4415 ((unsigned char)s[0] == 0xE0 &&
4416 (unsigned char)s[1] < 0xA0) ||
4417 ((unsigned char)s[0] == 0xED &&
4418 (unsigned char)s[1] > 0x9F)) {
4419
4420 goto surrogateescape;
4421 }
4422 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4423 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004424 *p++ = (wchar_t)ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00004425 break;
4426
4427 case 4:
4428 if ((s[1] & 0xc0) != 0x80 ||
4429 (s[2] & 0xc0) != 0x80 ||
4430 (s[3] & 0xc0) != 0x80 ||
4431 ((unsigned char)s[0] == 0xF0 &&
4432 (unsigned char)s[1] < 0x90) ||
4433 ((unsigned char)s[0] == 0xF4 &&
4434 (unsigned char)s[1] > 0x8F)) {
4435 goto surrogateescape;
4436 }
4437 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4438 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4439 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4440
4441#if SIZEOF_WCHAR_T == 4
4442 *p++ = (wchar_t)ch;
4443#else
4444 /* compute and append the two surrogates: */
4445
4446 /* translate from 10000..10FFFF to 0..FFFF */
4447 ch -= 0x10000;
4448
4449 /* high surrogate = top 10 bits added to D800 */
4450 *p++ = (wchar_t)(0xD800 + (ch >> 10));
4451
4452 /* low surrogate = bottom 10 bits added to DC00 */
4453 *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
4454#endif
4455 break;
4456 }
4457 s += n;
4458 continue;
4459
4460 surrogateescape:
4461 *p++ = 0xDC00 + ch;
4462 s++;
4463 }
4464 *p = L'\0';
4465 return unicode;
4466}
4467
4468#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004469
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004470/* Primary internal function which creates utf8 encoded bytes objects.
4471
4472 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004473 and allocate exactly as much space needed at the end. Else allocate the
4474 maximum possible needed (4 result bytes per Unicode character), and return
4475 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004476*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004477PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004478_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004479{
Tim Peters602f7402002-04-27 18:03:26 +00004480#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004481
Guido van Rossum98297ee2007-11-06 21:34:58 +00004482 Py_ssize_t i; /* index into s of next input byte */
4483 PyObject *result; /* result string object */
4484 char *p; /* next free byte in output buffer */
4485 Py_ssize_t nallocated; /* number of result bytes allocated */
4486 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004487 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004488 PyObject *errorHandler = NULL;
4489 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004490 int kind;
4491 void *data;
4492 Py_ssize_t size;
4493 PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4494#if SIZEOF_WCHAR_T == 2
4495 Py_ssize_t wchar_offset = 0;
4496#endif
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004497
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004498 if (!PyUnicode_Check(unicode)) {
4499 PyErr_BadArgument();
4500 return NULL;
4501 }
4502
4503 if (PyUnicode_READY(unicode) == -1)
4504 return NULL;
4505
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004506 if (PyUnicode_UTF8(unicode))
4507 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4508 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004509
4510 kind = PyUnicode_KIND(unicode);
4511 data = PyUnicode_DATA(unicode);
4512 size = PyUnicode_GET_LENGTH(unicode);
4513
Tim Peters602f7402002-04-27 18:03:26 +00004514 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004515
Tim Peters602f7402002-04-27 18:03:26 +00004516 if (size <= MAX_SHORT_UNICHARS) {
4517 /* Write into the stack buffer; nallocated can't overflow.
4518 * At the end, we'll allocate exactly as much heap space as it
4519 * turns out we need.
4520 */
4521 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004522 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004523 p = stackbuf;
4524 }
4525 else {
4526 /* Overallocate on the heap, and give the excess back at the end. */
4527 nallocated = size * 4;
4528 if (nallocated / 4 != size) /* overflow! */
4529 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004530 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004531 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004532 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004533 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004534 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004535
Tim Peters602f7402002-04-27 18:03:26 +00004536 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004537 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004538
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004539 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004540 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004541 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004542
Guido van Rossumd57fd912000-03-10 22:53:23 +00004543 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004544 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004545 *p++ = (char)(0xc0 | (ch >> 6));
4546 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner31be90b2010-04-22 19:38:16 +00004547 } else if (0xD800 <= ch && ch <= 0xDFFF) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004548 Py_ssize_t newpos;
4549 PyObject *rep;
4550 Py_ssize_t repsize, k, startpos;
4551 startpos = i-1;
4552#if SIZEOF_WCHAR_T == 2
4553 startpos += wchar_offset;
Victor Stinner445a6232010-04-22 20:01:57 +00004554#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004555 rep = unicode_encode_call_errorhandler(
4556 errors, &errorHandler, "utf-8", "surrogates not allowed",
4557 PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4558 &exc, startpos, startpos+1, &newpos);
4559 if (!rep)
4560 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004561
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004562 if (PyBytes_Check(rep))
4563 repsize = PyBytes_GET_SIZE(rep);
4564 else
4565 repsize = PyUnicode_GET_SIZE(rep);
4566
4567 if (repsize > 4) {
4568 Py_ssize_t offset;
4569
4570 if (result == NULL)
4571 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004572 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004573 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004574
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004575 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4576 /* integer overflow */
4577 PyErr_NoMemory();
4578 goto error;
4579 }
4580 nallocated += repsize - 4;
4581 if (result != NULL) {
4582 if (_PyBytes_Resize(&result, nallocated) < 0)
4583 goto error;
4584 } else {
4585 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004586 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004587 goto error;
4588 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4589 }
4590 p = PyBytes_AS_STRING(result) + offset;
4591 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004592
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004593 if (PyBytes_Check(rep)) {
4594 char *prep = PyBytes_AS_STRING(rep);
4595 for(k = repsize; k > 0; k--)
4596 *p++ = *prep++;
4597 } else /* rep is unicode */ {
4598 const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4599 Py_UNICODE c;
4600
4601 for(k=0; k<repsize; k++) {
4602 c = prep[k];
4603 if (0x80 <= c) {
4604 raise_encode_exception(&exc, "utf-8",
4605 PyUnicode_AS_UNICODE(unicode),
4606 size, i-1, i,
4607 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004608 goto error;
4609 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004610 *p++ = (char)prep[k];
Victor Stinner31be90b2010-04-22 19:38:16 +00004611 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004612 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004613 Py_DECREF(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004614 } else if (ch < 0x10000) {
4615 *p++ = (char)(0xe0 | (ch >> 12));
4616 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4617 *p++ = (char)(0x80 | (ch & 0x3f));
4618 } else /* ch >= 0x10000 */ {
Tim Peters602f7402002-04-27 18:03:26 +00004619 /* Encode UCS4 Unicode ordinals */
4620 *p++ = (char)(0xf0 | (ch >> 18));
4621 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4622 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4623 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004624#if SIZEOF_WCHAR_T == 2
4625 wchar_offset++;
4626#endif
Tim Peters602f7402002-04-27 18:03:26 +00004627 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004628 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004629
Guido van Rossum98297ee2007-11-06 21:34:58 +00004630 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004631 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004632 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004633 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004634 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004635 }
4636 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004637 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004638 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004639 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004640 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004641 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004642
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004643 Py_XDECREF(errorHandler);
4644 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004645 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004646 error:
4647 Py_XDECREF(errorHandler);
4648 Py_XDECREF(exc);
4649 Py_XDECREF(result);
4650 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004651
Tim Peters602f7402002-04-27 18:03:26 +00004652#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004653}
4654
Alexander Belopolsky40018472011-02-26 01:02:56 +00004655PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004656PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4657 Py_ssize_t size,
4658 const char *errors)
4659{
4660 PyObject *v, *unicode;
4661
4662 unicode = PyUnicode_FromUnicode(s, size);
4663 if (unicode == NULL)
4664 return NULL;
4665 v = _PyUnicode_AsUTF8String(unicode, errors);
4666 Py_DECREF(unicode);
4667 return v;
4668}
4669
4670PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004671PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004672{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004673 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004674}
4675
Walter Dörwald41980ca2007-08-16 21:55:45 +00004676/* --- UTF-32 Codec ------------------------------------------------------- */
4677
4678PyObject *
4679PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004680 Py_ssize_t size,
4681 const char *errors,
4682 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004683{
4684 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4685}
4686
4687PyObject *
4688PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004689 Py_ssize_t size,
4690 const char *errors,
4691 int *byteorder,
4692 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004693{
4694 const char *starts = s;
4695 Py_ssize_t startinpos;
4696 Py_ssize_t endinpos;
4697 Py_ssize_t outpos;
4698 PyUnicodeObject *unicode;
4699 Py_UNICODE *p;
4700#ifndef Py_UNICODE_WIDE
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004701 int pairs = 0;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004702 const unsigned char *qq;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004703#else
4704 const int pairs = 0;
4705#endif
Mark Dickinson7db923c2010-06-12 09:10:14 +00004706 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004707 int bo = 0; /* assume native ordering by default */
4708 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004709 /* Offsets from q for retrieving bytes in the right order. */
4710#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4711 int iorder[] = {0, 1, 2, 3};
4712#else
4713 int iorder[] = {3, 2, 1, 0};
4714#endif
4715 PyObject *errorHandler = NULL;
4716 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004717
Walter Dörwald41980ca2007-08-16 21:55:45 +00004718 q = (unsigned char *)s;
4719 e = q + size;
4720
4721 if (byteorder)
4722 bo = *byteorder;
4723
4724 /* Check for BOM marks (U+FEFF) in the input and adjust current
4725 byte order setting accordingly. In native mode, the leading BOM
4726 mark is skipped, in all other modes, it is copied to the output
4727 stream as-is (giving a ZWNBSP character). */
4728 if (bo == 0) {
4729 if (size >= 4) {
4730 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004731 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004732#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004733 if (bom == 0x0000FEFF) {
4734 q += 4;
4735 bo = -1;
4736 }
4737 else if (bom == 0xFFFE0000) {
4738 q += 4;
4739 bo = 1;
4740 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004741#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004742 if (bom == 0x0000FEFF) {
4743 q += 4;
4744 bo = 1;
4745 }
4746 else if (bom == 0xFFFE0000) {
4747 q += 4;
4748 bo = -1;
4749 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004750#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004751 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004752 }
4753
4754 if (bo == -1) {
4755 /* force LE */
4756 iorder[0] = 0;
4757 iorder[1] = 1;
4758 iorder[2] = 2;
4759 iorder[3] = 3;
4760 }
4761 else if (bo == 1) {
4762 /* force BE */
4763 iorder[0] = 3;
4764 iorder[1] = 2;
4765 iorder[2] = 1;
4766 iorder[3] = 0;
4767 }
4768
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004769 /* On narrow builds we split characters outside the BMP into two
4770 codepoints => count how much extra space we need. */
4771#ifndef Py_UNICODE_WIDE
4772 for (qq = q; qq < e; qq += 4)
4773 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4774 pairs++;
4775#endif
4776
4777 /* This might be one to much, because of a BOM */
4778 unicode = _PyUnicode_New((size+3)/4+pairs);
4779 if (!unicode)
4780 return NULL;
4781 if (size == 0)
4782 return (PyObject *)unicode;
4783
4784 /* Unpack UTF-32 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004785 p = PyUnicode_AS_UNICODE(unicode);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004786
Walter Dörwald41980ca2007-08-16 21:55:45 +00004787 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004788 Py_UCS4 ch;
4789 /* remaining bytes at the end? (size should be divisible by 4) */
4790 if (e-q<4) {
4791 if (consumed)
4792 break;
4793 errmsg = "truncated data";
4794 startinpos = ((const char *)q)-starts;
4795 endinpos = ((const char *)e)-starts;
4796 goto utf32Error;
4797 /* The remaining input chars are ignored if the callback
4798 chooses to skip the input */
4799 }
4800 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4801 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004802
Benjamin Peterson29060642009-01-31 22:14:21 +00004803 if (ch >= 0x110000)
4804 {
4805 errmsg = "codepoint not in range(0x110000)";
4806 startinpos = ((const char *)q)-starts;
4807 endinpos = startinpos+4;
4808 goto utf32Error;
4809 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004810#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004811 if (ch >= 0x10000)
4812 {
4813 *p++ = 0xD800 | ((ch-0x10000) >> 10);
4814 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4815 }
4816 else
Walter Dörwald41980ca2007-08-16 21:55:45 +00004817#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004818 *p++ = ch;
4819 q += 4;
4820 continue;
4821 utf32Error:
4822 outpos = p-PyUnicode_AS_UNICODE(unicode);
4823 if (unicode_decode_call_errorhandler(
4824 errors, &errorHandler,
4825 "utf32", errmsg,
4826 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4827 &unicode, &outpos, &p))
4828 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004829 }
4830
4831 if (byteorder)
4832 *byteorder = bo;
4833
4834 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004835 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004836
4837 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02004838 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004839 goto onError;
4840
4841 Py_XDECREF(errorHandler);
4842 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02004843#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02004844 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004845 Py_DECREF(unicode);
4846 return NULL;
4847 }
Victor Stinner17efeed2011-10-04 20:05:46 +02004848#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004849 assert(_PyUnicode_CheckConsistency(unicode, 1));
Walter Dörwald41980ca2007-08-16 21:55:45 +00004850 return (PyObject *)unicode;
4851
Benjamin Peterson29060642009-01-31 22:14:21 +00004852 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00004853 Py_DECREF(unicode);
4854 Py_XDECREF(errorHandler);
4855 Py_XDECREF(exc);
4856 return NULL;
4857}
4858
4859PyObject *
4860PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004861 Py_ssize_t size,
4862 const char *errors,
4863 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004864{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004865 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004866 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004867 Py_ssize_t nsize, bytesize;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004868#ifndef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004869 Py_ssize_t i, pairs;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004870#else
4871 const int pairs = 0;
4872#endif
4873 /* Offsets from p for storing byte pairs in the right order. */
4874#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4875 int iorder[] = {0, 1, 2, 3};
4876#else
4877 int iorder[] = {3, 2, 1, 0};
4878#endif
4879
Benjamin Peterson29060642009-01-31 22:14:21 +00004880#define STORECHAR(CH) \
4881 do { \
4882 p[iorder[3]] = ((CH) >> 24) & 0xff; \
4883 p[iorder[2]] = ((CH) >> 16) & 0xff; \
4884 p[iorder[1]] = ((CH) >> 8) & 0xff; \
4885 p[iorder[0]] = (CH) & 0xff; \
4886 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00004887 } while(0)
4888
4889 /* In narrow builds we can output surrogate pairs as one codepoint,
4890 so we need less space. */
4891#ifndef Py_UNICODE_WIDE
4892 for (i = pairs = 0; i < size-1; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00004893 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4894 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4895 pairs++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004896#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004897 nsize = (size - pairs + (byteorder == 0));
4898 bytesize = nsize * 4;
4899 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00004900 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004901 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004902 if (v == NULL)
4903 return NULL;
4904
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004905 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004906 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004907 STORECHAR(0xFEFF);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004908 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00004909 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004910
4911 if (byteorder == -1) {
4912 /* force LE */
4913 iorder[0] = 0;
4914 iorder[1] = 1;
4915 iorder[2] = 2;
4916 iorder[3] = 3;
4917 }
4918 else if (byteorder == 1) {
4919 /* force BE */
4920 iorder[0] = 3;
4921 iorder[1] = 2;
4922 iorder[2] = 1;
4923 iorder[3] = 0;
4924 }
4925
4926 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004927 Py_UCS4 ch = *s++;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004928#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00004929 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4930 Py_UCS4 ch2 = *s;
4931 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4932 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4933 s++;
4934 size--;
4935 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004936 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004937#endif
4938 STORECHAR(ch);
4939 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00004940
4941 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004942 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004943#undef STORECHAR
4944}
4945
Alexander Belopolsky40018472011-02-26 01:02:56 +00004946PyObject *
4947PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004948{
4949 if (!PyUnicode_Check(unicode)) {
4950 PyErr_BadArgument();
4951 return NULL;
4952 }
4953 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00004954 PyUnicode_GET_SIZE(unicode),
4955 NULL,
4956 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00004957}
4958
Guido van Rossumd57fd912000-03-10 22:53:23 +00004959/* --- UTF-16 Codec ------------------------------------------------------- */
4960
Tim Peters772747b2001-08-09 22:21:55 +00004961PyObject *
4962PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004963 Py_ssize_t size,
4964 const char *errors,
4965 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004966{
Walter Dörwald69652032004-09-07 20:24:22 +00004967 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
4968}
4969
Antoine Pitrouab868312009-01-10 15:40:25 +00004970/* Two masks for fast checking of whether a C 'long' may contain
4971 UTF16-encoded surrogate characters. This is an efficient heuristic,
4972 assuming that non-surrogate characters with a code point >= 0x8000 are
4973 rare in most input.
4974 FAST_CHAR_MASK is used when the input is in native byte ordering,
4975 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00004976*/
Antoine Pitrouab868312009-01-10 15:40:25 +00004977#if (SIZEOF_LONG == 8)
4978# define FAST_CHAR_MASK 0x8000800080008000L
4979# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
4980#elif (SIZEOF_LONG == 4)
4981# define FAST_CHAR_MASK 0x80008000L
4982# define SWAPPED_FAST_CHAR_MASK 0x00800080L
4983#else
4984# error C 'long' size should be either 4 or 8!
4985#endif
4986
Walter Dörwald69652032004-09-07 20:24:22 +00004987PyObject *
4988PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004989 Py_ssize_t size,
4990 const char *errors,
4991 int *byteorder,
4992 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00004993{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004994 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004995 Py_ssize_t startinpos;
4996 Py_ssize_t endinpos;
4997 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004998 PyUnicodeObject *unicode;
4999 Py_UNICODE *p;
Antoine Pitrouab868312009-01-10 15:40:25 +00005000 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005001 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005002 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005003 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005004 /* Offsets from q for retrieving byte pairs in the right order. */
5005#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5006 int ihi = 1, ilo = 0;
5007#else
5008 int ihi = 0, ilo = 1;
5009#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005010 PyObject *errorHandler = NULL;
5011 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005012
5013 /* Note: size will always be longer than the resulting Unicode
5014 character count */
5015 unicode = _PyUnicode_New(size);
5016 if (!unicode)
5017 return NULL;
5018 if (size == 0)
5019 return (PyObject *)unicode;
5020
5021 /* Unpack UTF-16 encoded data */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005022 p = PyUnicode_AS_UNICODE(unicode);
Tim Peters772747b2001-08-09 22:21:55 +00005023 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005024 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005025
5026 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005027 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005028
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005029 /* Check for BOM marks (U+FEFF) in the input and adjust current
5030 byte order setting accordingly. In native mode, the leading BOM
5031 mark is skipped, in all other modes, it is copied to the output
5032 stream as-is (giving a ZWNBSP character). */
5033 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005034 if (size >= 2) {
5035 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005036#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005037 if (bom == 0xFEFF) {
5038 q += 2;
5039 bo = -1;
5040 }
5041 else if (bom == 0xFFFE) {
5042 q += 2;
5043 bo = 1;
5044 }
Tim Petersced69f82003-09-16 20:30:58 +00005045#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005046 if (bom == 0xFEFF) {
5047 q += 2;
5048 bo = 1;
5049 }
5050 else if (bom == 0xFFFE) {
5051 q += 2;
5052 bo = -1;
5053 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005054#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005055 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005056 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005057
Tim Peters772747b2001-08-09 22:21:55 +00005058 if (bo == -1) {
5059 /* force LE */
5060 ihi = 1;
5061 ilo = 0;
5062 }
5063 else if (bo == 1) {
5064 /* force BE */
5065 ihi = 0;
5066 ilo = 1;
5067 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005068#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5069 native_ordering = ilo < ihi;
5070#else
5071 native_ordering = ilo > ihi;
5072#endif
Tim Peters772747b2001-08-09 22:21:55 +00005073
Antoine Pitrouab868312009-01-10 15:40:25 +00005074 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005075 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005076 Py_UNICODE ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005077 /* First check for possible aligned read of a C 'long'. Unaligned
5078 reads are more expensive, better to defer to another iteration. */
5079 if (!((size_t) q & LONG_PTR_MASK)) {
5080 /* Fast path for runs of non-surrogate chars. */
5081 register const unsigned char *_q = q;
5082 Py_UNICODE *_p = p;
5083 if (native_ordering) {
5084 /* Native ordering is simple: as long as the input cannot
5085 possibly contain a surrogate char, do an unrolled copy
5086 of several 16-bit code points to the target object.
5087 The non-surrogate check is done on several input bytes
5088 at a time (as many as a C 'long' can contain). */
5089 while (_q < aligned_end) {
5090 unsigned long data = * (unsigned long *) _q;
5091 if (data & FAST_CHAR_MASK)
5092 break;
5093 _p[0] = ((unsigned short *) _q)[0];
5094 _p[1] = ((unsigned short *) _q)[1];
5095#if (SIZEOF_LONG == 8)
5096 _p[2] = ((unsigned short *) _q)[2];
5097 _p[3] = ((unsigned short *) _q)[3];
5098#endif
5099 _q += SIZEOF_LONG;
5100 _p += SIZEOF_LONG / 2;
5101 }
5102 }
5103 else {
5104 /* Byteswapped ordering is similar, but we must decompose
5105 the copy bytewise, and take care of zero'ing out the
5106 upper bytes if the target object is in 32-bit units
5107 (that is, in UCS-4 builds). */
5108 while (_q < aligned_end) {
5109 unsigned long data = * (unsigned long *) _q;
5110 if (data & SWAPPED_FAST_CHAR_MASK)
5111 break;
5112 /* Zero upper bytes in UCS-4 builds */
5113#if (Py_UNICODE_SIZE > 2)
5114 _p[0] = 0;
5115 _p[1] = 0;
5116#if (SIZEOF_LONG == 8)
5117 _p[2] = 0;
5118 _p[3] = 0;
5119#endif
5120#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005121 /* Issue #4916; UCS-4 builds on big endian machines must
5122 fill the two last bytes of each 4-byte unit. */
5123#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
5124# define OFF 2
5125#else
5126# define OFF 0
Antoine Pitrouab868312009-01-10 15:40:25 +00005127#endif
Antoine Pitroud6e8de12009-01-11 23:56:55 +00005128 ((unsigned char *) _p)[OFF + 1] = _q[0];
5129 ((unsigned char *) _p)[OFF + 0] = _q[1];
5130 ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
5131 ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
5132#if (SIZEOF_LONG == 8)
5133 ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
5134 ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
5135 ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
5136 ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
5137#endif
5138#undef OFF
Antoine Pitrouab868312009-01-10 15:40:25 +00005139 _q += SIZEOF_LONG;
5140 _p += SIZEOF_LONG / 2;
5141 }
5142 }
5143 p = _p;
5144 q = _q;
5145 if (q >= e)
5146 break;
5147 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005148 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005149
Benjamin Peterson14339b62009-01-31 16:36:08 +00005150 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005151
5152 if (ch < 0xD800 || ch > 0xDFFF) {
5153 *p++ = ch;
5154 continue;
5155 }
5156
5157 /* UTF-16 code pair: */
5158 if (q > e) {
5159 errmsg = "unexpected end of data";
5160 startinpos = (((const char *)q) - 2) - starts;
5161 endinpos = ((const char *)e) + 1 - starts;
5162 goto utf16Error;
5163 }
5164 if (0xD800 <= ch && ch <= 0xDBFF) {
5165 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
5166 q += 2;
5167 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00005168#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005169 *p++ = ch;
5170 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005171#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005172 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005173#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005174 continue;
5175 }
5176 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005177 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005178 startinpos = (((const char *)q)-4)-starts;
5179 endinpos = startinpos+2;
5180 goto utf16Error;
5181 }
5182
Benjamin Peterson14339b62009-01-31 16:36:08 +00005183 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005184 errmsg = "illegal encoding";
5185 startinpos = (((const char *)q)-2)-starts;
5186 endinpos = startinpos+2;
5187 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005188
Benjamin Peterson29060642009-01-31 22:14:21 +00005189 utf16Error:
5190 outpos = p - PyUnicode_AS_UNICODE(unicode);
5191 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005192 errors,
5193 &errorHandler,
5194 "utf16", errmsg,
5195 &starts,
5196 (const char **)&e,
5197 &startinpos,
5198 &endinpos,
5199 &exc,
5200 (const char **)&q,
5201 &unicode,
5202 &outpos,
5203 &p))
Benjamin Peterson29060642009-01-31 22:14:21 +00005204 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005205 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005206 /* remaining byte at the end? (size should be even) */
5207 if (e == q) {
5208 if (!consumed) {
5209 errmsg = "truncated data";
5210 startinpos = ((const char *)q) - starts;
5211 endinpos = ((const char *)e) + 1 - starts;
5212 outpos = p - PyUnicode_AS_UNICODE(unicode);
5213 if (unicode_decode_call_errorhandler(
5214 errors,
5215 &errorHandler,
5216 "utf16", errmsg,
5217 &starts,
5218 (const char **)&e,
5219 &startinpos,
5220 &endinpos,
5221 &exc,
5222 (const char **)&q,
5223 &unicode,
5224 &outpos,
5225 &p))
5226 goto onError;
5227 /* The remaining input chars are ignored if the callback
5228 chooses to skip the input */
5229 }
5230 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005231
5232 if (byteorder)
5233 *byteorder = bo;
5234
Walter Dörwald69652032004-09-07 20:24:22 +00005235 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005236 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005237
Guido van Rossumd57fd912000-03-10 22:53:23 +00005238 /* Adjust length */
Victor Stinnerfe226c02011-10-03 03:52:20 +02005239 if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005240 goto onError;
5241
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005242 Py_XDECREF(errorHandler);
5243 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005244#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005245 if (_PyUnicode_READY_REPLACE(&unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005246 Py_DECREF(unicode);
5247 return NULL;
5248 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005249#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005250 assert(_PyUnicode_CheckConsistency(unicode, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005251 return (PyObject *)unicode;
5252
Benjamin Peterson29060642009-01-31 22:14:21 +00005253 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005254 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005255 Py_XDECREF(errorHandler);
5256 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005257 return NULL;
5258}
5259
Antoine Pitrouab868312009-01-10 15:40:25 +00005260#undef FAST_CHAR_MASK
5261#undef SWAPPED_FAST_CHAR_MASK
5262
Tim Peters772747b2001-08-09 22:21:55 +00005263PyObject *
5264PyUnicode_EncodeUTF16(const Py_UNICODE *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005265 Py_ssize_t size,
5266 const char *errors,
5267 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005268{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005269 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005270 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005271 Py_ssize_t nsize, bytesize;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005272#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005273 Py_ssize_t i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005274#else
5275 const int pairs = 0;
5276#endif
Tim Peters772747b2001-08-09 22:21:55 +00005277 /* Offsets from p for storing byte pairs in the right order. */
5278#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5279 int ihi = 1, ilo = 0;
5280#else
5281 int ihi = 0, ilo = 1;
5282#endif
5283
Benjamin Peterson29060642009-01-31 22:14:21 +00005284#define STORECHAR(CH) \
5285 do { \
5286 p[ihi] = ((CH) >> 8) & 0xff; \
5287 p[ilo] = (CH) & 0xff; \
5288 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005289 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005290
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005291#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005292 for (i = pairs = 0; i < size; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +00005293 if (s[i] >= 0x10000)
5294 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005295#endif
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005296 /* 2 * (size + pairs + (byteorder == 0)) */
5297 if (size > PY_SSIZE_T_MAX ||
5298 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005299 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005300 nsize = size + pairs + (byteorder == 0);
5301 bytesize = nsize * 2;
5302 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005303 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005304 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005305 if (v == NULL)
5306 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005307
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005308 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005309 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005310 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005311 if (size == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005312 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005313
5314 if (byteorder == -1) {
5315 /* force LE */
5316 ihi = 1;
5317 ilo = 0;
5318 }
5319 else if (byteorder == 1) {
5320 /* force BE */
5321 ihi = 0;
5322 ilo = 1;
5323 }
5324
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005325 while (size-- > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005326 Py_UNICODE ch = *s++;
5327 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005328#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005329 if (ch >= 0x10000) {
5330 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5331 ch = 0xD800 | ((ch-0x10000) >> 10);
5332 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00005333#endif
Tim Peters772747b2001-08-09 22:21:55 +00005334 STORECHAR(ch);
5335 if (ch2)
5336 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005337 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005338
5339 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005340 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005341#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005342}
5343
Alexander Belopolsky40018472011-02-26 01:02:56 +00005344PyObject *
5345PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005346{
5347 if (!PyUnicode_Check(unicode)) {
5348 PyErr_BadArgument();
5349 return NULL;
5350 }
5351 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00005352 PyUnicode_GET_SIZE(unicode),
5353 NULL,
5354 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005355}
5356
5357/* --- Unicode Escape Codec ----------------------------------------------- */
5358
/* Helper function for PyUnicode_DecodeUnicodeEscape: determines whether
   the escaped input decodes to a pure ASCII string using only escapes
   whose result is known without further parsing.

   s     - input bytes (need not be NUL-terminated)
   size  - number of bytes at s

   Returns the number of characters the decoded string will contain, or
   -1 if size is negative, if a non-ASCII byte is present, if the input
   ends in a lone backslash, or if an octal, \x, \u, \U or \N escape is
   present (those are not guaranteed to produce ASCII characters).
 */
Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Reject a negative size before doing any pointer arithmetic with it;
       computing p + size for size < 0 would be undefined behaviour. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                /* single-character escapes */
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* unknown escape: count the backslash + the other
                   character */
                length += 2;
            }
        }
    }
    return length;
}
5413
5414/* Similar to PyUnicode_WRITE but either write into wstr field
5415 or treat string as ASCII. */
5416#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \
5417 do { \
5418 if ((kind) != PyUnicode_WCHAR_KIND) \
5419 ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
5420 else \
5421 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
5422 } while (0)
5423
5424#define WRITE_WSTR(buf, index, value) \
5425 assert(kind == PyUnicode_WCHAR_KIND), \
5426 ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value)
5427
5428
Fredrik Lundh06d12682001-01-24 07:59:11 +00005429static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005430
Alexander Belopolsky40018472011-02-26 01:02:56 +00005431PyObject *
5432PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005433 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005434 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005435{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005436 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005437 Py_ssize_t startinpos;
5438 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005439 int j;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005440 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005441 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005442 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005443 char* message;
5444 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005445 PyObject *errorHandler = NULL;
5446 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005447 Py_ssize_t ascii_length;
5448 Py_ssize_t i;
5449 int kind;
5450 void *data;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005451
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005452 ascii_length = length_of_escaped_ascii_string(s, size);
5453
5454 /* After length_of_escaped_ascii_string() there are two alternatives,
5455 either the string is pure ASCII with named escapes like \n, etc.
5456 and we determined it's exact size (common case)
5457 or it contains \x, \u, ... escape sequences. then we create a
5458 legacy wchar string and resize it at the end of this function. */
5459 if (ascii_length >= 0) {
5460 v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
5461 if (!v)
5462 goto onError;
5463 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5464 kind = PyUnicode_1BYTE_KIND;
5465 data = PyUnicode_DATA(v);
5466 }
5467 else {
5468 /* Escaped strings will always be longer than the resulting
5469 Unicode string, so we start with size here and then reduce the
5470 length after conversion to the true value.
5471 (but if the error callback returns a long replacement string
5472 we'll have to allocate more space) */
5473 v = _PyUnicode_New(size);
5474 if (!v)
5475 goto onError;
5476 kind = PyUnicode_WCHAR_KIND;
5477 data = PyUnicode_AS_UNICODE(v);
5478 }
5479
Guido van Rossumd57fd912000-03-10 22:53:23 +00005480 if (size == 0)
5481 return (PyObject *)v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005482 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005483 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005484
Guido van Rossumd57fd912000-03-10 22:53:23 +00005485 while (s < end) {
5486 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00005487 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005488 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005489
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005490 if (kind == PyUnicode_WCHAR_KIND) {
5491 assert(i < _PyUnicode_WSTR_LENGTH(v));
5492 }
5493 else {
5494 /* The only case in which i == ascii_length is a backslash
5495 followed by a newline. */
5496 assert(i <= ascii_length);
5497 }
5498
Guido van Rossumd57fd912000-03-10 22:53:23 +00005499 /* Non-escape characters are interpreted as Unicode ordinals */
5500 if (*s != '\\') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005501 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005502 continue;
5503 }
5504
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005505 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005506 /* \ - Escapes */
5507 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005508 c = *s++;
5509 if (s > end)
5510 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005511
5512 if (kind == PyUnicode_WCHAR_KIND) {
5513 assert(i < _PyUnicode_WSTR_LENGTH(v));
5514 }
5515 else {
5516 /* The only case in which i == ascii_length is a backslash
5517 followed by a newline. */
5518 assert(i < ascii_length || (i == ascii_length && c == '\n'));
5519 }
5520
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005521 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005522
Benjamin Peterson29060642009-01-31 22:14:21 +00005523 /* \x escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005524 case '\n': break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005525 case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5526 case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5527 case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5528 case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5529 /* FF */
5530 case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5531 case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5532 case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5533 case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5534 /* VT */
5535 case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5536 /* BEL, not classic C */
5537 case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005538
Benjamin Peterson29060642009-01-31 22:14:21 +00005539 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005540 case '0': case '1': case '2': case '3':
5541 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005542 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005543 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005544 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005545 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005546 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005547 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005548 WRITE_WSTR(data, i++, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005549 break;
5550
Benjamin Peterson29060642009-01-31 22:14:21 +00005551 /* hex escapes */
5552 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005553 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005554 digits = 2;
5555 message = "truncated \\xXX escape";
5556 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005557
Benjamin Peterson29060642009-01-31 22:14:21 +00005558 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005559 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005560 digits = 4;
5561 message = "truncated \\uXXXX escape";
5562 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005563
Benjamin Peterson29060642009-01-31 22:14:21 +00005564 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005565 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005566 digits = 8;
5567 message = "truncated \\UXXXXXXXX escape";
5568 hexescape:
5569 chr = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005570 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005571 if (s+digits>end) {
5572 endinpos = size;
5573 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005574 errors, &errorHandler,
5575 "unicodeescape", "end of string in escape sequence",
5576 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005577 &v, &i, &p))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005578 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005579 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005580 goto nextByte;
5581 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005582 for (j = 0; j < digits; ++j) {
5583 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005584 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005585 endinpos = (s+j+1)-starts;
5586 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005587 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005588 errors, &errorHandler,
5589 "unicodeescape", message,
5590 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005591 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005592 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005593 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005594 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005595 }
5596 chr = (chr<<4) & ~0xF;
5597 if (c >= '0' && c <= '9')
5598 chr += c - '0';
5599 else if (c >= 'a' && c <= 'f')
5600 chr += 10 + c - 'a';
5601 else
5602 chr += 10 + c - 'A';
5603 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005604 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005605 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005606 /* _decoding_error will have already written into the
5607 target buffer. */
5608 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005609 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005610 /* when we get here, chr is a 32-bit unicode character */
5611 if (chr <= 0xffff)
5612 /* UCS-2 character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005613 WRITE_WSTR(data, i++, chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005614 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005615 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00005616 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00005617#ifdef Py_UNICODE_WIDE
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005618 WRITE_WSTR(data, i++, chr);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005619#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00005620 chr -= 0x10000L;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005621 WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5622 WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005623#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00005624 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005625 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005626 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005627 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005628 errors, &errorHandler,
5629 "unicodeescape", "illegal Unicode character",
5630 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005631 &v, &i, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005632 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005633 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005634 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005635 break;
5636
Benjamin Peterson29060642009-01-31 22:14:21 +00005637 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005638 case 'N':
5639 message = "malformed \\N character escape";
5640 if (ucnhash_CAPI == NULL) {
5641 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005642 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5643 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005644 if (ucnhash_CAPI == NULL)
5645 goto ucnhashError;
5646 }
5647 if (*s == '{') {
5648 const char *start = s+1;
5649 /* look for the closing brace */
5650 while (*s != '}' && s < end)
5651 s++;
5652 if (s > start && s < end && *s == '}') {
5653 /* found a name. look it up in the unicode database */
5654 message = "unknown Unicode character name";
5655 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005656 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5657 &chr))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005658 goto store;
5659 }
5660 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005661 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005662 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005663 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005664 errors, &errorHandler,
5665 "unicodeescape", message,
5666 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005667 &v, &i, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005668 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005669 data = PyUnicode_AS_UNICODE(v);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005670 break;
5671
5672 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005673 if (s > end) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005674 assert(kind == PyUnicode_WCHAR_KIND);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005675 message = "\\ at end of string";
5676 s--;
5677 endinpos = s-starts;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005678 p = PyUnicode_AS_UNICODE(v) + i;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005679 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005680 errors, &errorHandler,
5681 "unicodeescape", message,
5682 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005683 &v, &i, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00005684 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005685 data = PyUnicode_AS_UNICODE(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005686 }
5687 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005688 WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5689 WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005690 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005691 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005692 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005693 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005694 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005695 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005696 /* Ensure the length prediction worked in case of ASCII strings */
5697 assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5698
Victor Stinnerfe226c02011-10-03 03:52:20 +02005699 if (kind == PyUnicode_WCHAR_KIND)
5700 {
5701 if (PyUnicode_Resize((PyObject**)&v, i) < 0)
5702 goto onError;
Victor Stinnerfe226c02011-10-03 03:52:20 +02005703 }
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005704 Py_XDECREF(errorHandler);
5705 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02005706#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02005707 if (_PyUnicode_READY_REPLACE(&v)) {
5708 Py_DECREF(v);
5709 return NULL;
5710 }
Victor Stinner17efeed2011-10-04 20:05:46 +02005711#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02005712 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005713 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00005714
Benjamin Peterson29060642009-01-31 22:14:21 +00005715 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005716 PyErr_SetString(
5717 PyExc_UnicodeError,
5718 "\\N escapes not supported (can't load unicodedata module)"
5719 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005720 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005721 Py_XDECREF(errorHandler);
5722 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005723 return NULL;
5724
Benjamin Peterson29060642009-01-31 22:14:21 +00005725 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005726 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005727 Py_XDECREF(errorHandler);
5728 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005729 return NULL;
5730}
5731
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005732#undef WRITE_ASCII_OR_WSTR
5733#undef WRITE_WSTR
5734
/* Return a Unicode-Escape encoded version of the given Unicode data as a
   bytes object.

   No enclosing quotes are added to the result.

*/
5741
Walter Dörwald79e913e2007-05-12 11:08:06 +00005742static const char *hexdigits = "0123456789abcdef";
5743
Alexander Belopolsky40018472011-02-26 01:02:56 +00005744PyObject *
5745PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005746 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005747{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005748 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005749 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005750
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005751#ifdef Py_UNICODE_WIDE
5752 const Py_ssize_t expandsize = 10;
5753#else
5754 const Py_ssize_t expandsize = 6;
5755#endif
5756
Thomas Wouters89f507f2006-12-13 04:49:30 +00005757 /* XXX(nnorwitz): rather than over-allocating, it would be
5758 better to choose a different scheme. Perhaps scan the
5759 first N-chars of the string and allocate based on that size.
5760 */
5761 /* Initial allocation is based on the longest-possible unichr
5762 escape.
5763
5764 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5765 unichr, so in this case it's the longest unichr escape. In
5766 narrow (UTF-16) builds this is five chars per source unichr
5767 since there are two unichrs in the surrogate pair, so in narrow
5768 (UTF-16) builds it's not the longest unichr escape.
5769
5770 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5771 so in the narrow (UTF-16) build case it's the longest unichr
5772 escape.
5773 */
5774
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005775 if (size == 0)
5776 return PyBytes_FromStringAndSize(NULL, 0);
5777
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005778 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005779 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005780
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005781 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005782 2
5783 + expandsize*size
5784 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005785 if (repr == NULL)
5786 return NULL;
5787
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005788 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005789
Guido van Rossumd57fd912000-03-10 22:53:23 +00005790 while (size-- > 0) {
5791 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005792
Walter Dörwald79e913e2007-05-12 11:08:06 +00005793 /* Escape backslashes */
5794 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005795 *p++ = '\\';
5796 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005797 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005798 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005799
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00005800#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005801 /* Map 21-bit characters to '\U00xxxxxx' */
5802 else if (ch >= 0x10000) {
5803 *p++ = '\\';
5804 *p++ = 'U';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005805 *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5806 *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5807 *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5808 *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5809 *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5810 *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5811 *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5812 *p++ = hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005813 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005814 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005815#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005816 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5817 else if (ch >= 0xD800 && ch < 0xDC00) {
5818 Py_UNICODE ch2;
5819 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00005820
Benjamin Peterson29060642009-01-31 22:14:21 +00005821 ch2 = *s++;
5822 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00005823 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005824 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5825 *p++ = '\\';
5826 *p++ = 'U';
5827 *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5828 *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5829 *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5830 *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5831 *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5832 *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5833 *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5834 *p++ = hexdigits[ucs & 0x0000000F];
5835 continue;
5836 }
5837 /* Fall through: isolated surrogates are copied as-is */
5838 s--;
5839 size++;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005840 }
Thomas Wouters89f507f2006-12-13 04:49:30 +00005841#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005842
Guido van Rossumd57fd912000-03-10 22:53:23 +00005843 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005844 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005845 *p++ = '\\';
5846 *p++ = 'u';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005847 *p++ = hexdigits[(ch >> 12) & 0x000F];
5848 *p++ = hexdigits[(ch >> 8) & 0x000F];
5849 *p++ = hexdigits[(ch >> 4) & 0x000F];
5850 *p++ = hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005851 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005852
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005853 /* Map special whitespace to '\t', \n', '\r' */
5854 else if (ch == '\t') {
5855 *p++ = '\\';
5856 *p++ = 't';
5857 }
5858 else if (ch == '\n') {
5859 *p++ = '\\';
5860 *p++ = 'n';
5861 }
5862 else if (ch == '\r') {
5863 *p++ = '\\';
5864 *p++ = 'r';
5865 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005866
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005867 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005868 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005869 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005870 *p++ = 'x';
Walter Dörwald79e913e2007-05-12 11:08:06 +00005871 *p++ = hexdigits[(ch >> 4) & 0x000F];
5872 *p++ = hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005873 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005874
Guido van Rossumd57fd912000-03-10 22:53:23 +00005875 /* Copy everything else as-is */
5876 else
5877 *p++ = (char) ch;
5878 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005879
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005880 assert(p - PyBytes_AS_STRING(repr) > 0);
5881 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5882 return NULL;
5883 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005884}
5885
Alexander Belopolsky40018472011-02-26 01:02:56 +00005886PyObject *
5887PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005888{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005889 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005890 if (!PyUnicode_Check(unicode)) {
5891 PyErr_BadArgument();
5892 return NULL;
5893 }
Walter Dörwald79e913e2007-05-12 11:08:06 +00005894 s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5895 PyUnicode_GET_SIZE(unicode));
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00005896 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005897}
5898
5899/* --- Raw Unicode Escape Codec ------------------------------------------- */
5900
Alexander Belopolsky40018472011-02-26 01:02:56 +00005901PyObject *
5902PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005903 Py_ssize_t size,
5904 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005905{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005906 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005907 Py_ssize_t startinpos;
5908 Py_ssize_t endinpos;
5909 Py_ssize_t outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005910 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005911 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005912 const char *end;
5913 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005914 PyObject *errorHandler = NULL;
5915 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005916
Guido van Rossumd57fd912000-03-10 22:53:23 +00005917 /* Escaped strings will always be longer than the resulting
5918 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005919 length after conversion to the true value. (But decoding error
5920 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005921 v = _PyUnicode_New(size);
5922 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00005923 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005924 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005925 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005926 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005927 end = s + size;
5928 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005929 unsigned char c;
5930 Py_UCS4 x;
5931 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005932 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005933
Benjamin Peterson29060642009-01-31 22:14:21 +00005934 /* Non-escape characters are interpreted as Unicode ordinals */
5935 if (*s != '\\') {
5936 *p++ = (unsigned char)*s++;
5937 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005938 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005939 startinpos = s-starts;
5940
5941 /* \u-escapes are only interpreted iff the number of leading
5942 backslashes if odd */
5943 bs = s;
5944 for (;s < end;) {
5945 if (*s != '\\')
5946 break;
5947 *p++ = (unsigned char)*s++;
5948 }
5949 if (((s - bs) & 1) == 0 ||
5950 s >= end ||
5951 (*s != 'u' && *s != 'U')) {
5952 continue;
5953 }
5954 p--;
5955 count = *s=='u' ? 4 : 8;
5956 s++;
5957
5958 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5959 outpos = p-PyUnicode_AS_UNICODE(v);
5960 for (x = 0, i = 0; i < count; ++i, ++s) {
5961 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00005962 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005963 endinpos = s-starts;
5964 if (unicode_decode_call_errorhandler(
5965 errors, &errorHandler,
5966 "rawunicodeescape", "truncated \\uXXXX",
5967 &starts, &end, &startinpos, &endinpos, &exc, &s,
5968 &v, &outpos, &p))
5969 goto onError;
5970 goto nextByte;
5971 }
5972 x = (x<<4) & ~0xF;
5973 if (c >= '0' && c <= '9')
5974 x += c - '0';
5975 else if (c >= 'a' && c <= 'f')
5976 x += 10 + c - 'a';
5977 else
5978 x += 10 + c - 'A';
5979 }
Christian Heimesfe337bf2008-03-23 21:54:12 +00005980 if (x <= 0xffff)
Benjamin Peterson29060642009-01-31 22:14:21 +00005981 /* UCS-2 character */
5982 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005983 else if (x <= 0x10ffff) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005984 /* UCS-4 character. Either store directly, or as
5985 surrogate pair. */
Christian Heimesfe337bf2008-03-23 21:54:12 +00005986#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00005987 *p++ = (Py_UNICODE) x;
Christian Heimesfe337bf2008-03-23 21:54:12 +00005988#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005989 x -= 0x10000L;
5990 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
5991 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
Christian Heimesfe337bf2008-03-23 21:54:12 +00005992#endif
5993 } else {
5994 endinpos = s-starts;
5995 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00005996 if (unicode_decode_call_errorhandler(
5997 errors, &errorHandler,
5998 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00005999 &starts, &end, &startinpos, &endinpos, &exc, &s,
6000 &v, &outpos, &p))
6001 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006002 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006003 nextByte:
6004 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006005 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02006006 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006007 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006008 Py_XDECREF(errorHandler);
6009 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006010#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006011 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006012 Py_DECREF(v);
6013 return NULL;
6014 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006015#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006016 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006017 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006018
Benjamin Peterson29060642009-01-31 22:14:21 +00006019 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006020 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006021 Py_XDECREF(errorHandler);
6022 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006023 return NULL;
6024}
6025
Alexander Belopolsky40018472011-02-26 01:02:56 +00006026PyObject *
6027PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006028 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006029{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006030 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006031 char *p;
6032 char *q;
6033
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006034#ifdef Py_UNICODE_WIDE
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006035 const Py_ssize_t expandsize = 10;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006036#else
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006037 const Py_ssize_t expandsize = 6;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006038#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00006039
Neal Norwitz3ce5d922008-08-24 07:08:55 +00006040 if (size > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006041 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006042
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006043 repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006044 if (repr == NULL)
6045 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00006046 if (size == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006047 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006048
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006049 p = q = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006050 while (size-- > 0) {
6051 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006052#ifdef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006053 /* Map 32-bit characters to '\Uxxxxxxxx' */
6054 if (ch >= 0x10000) {
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006055 *p++ = '\\';
6056 *p++ = 'U';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00006057 *p++ = hexdigits[(ch >> 28) & 0xf];
6058 *p++ = hexdigits[(ch >> 24) & 0xf];
6059 *p++ = hexdigits[(ch >> 20) & 0xf];
6060 *p++ = hexdigits[(ch >> 16) & 0xf];
6061 *p++ = hexdigits[(ch >> 12) & 0xf];
6062 *p++ = hexdigits[(ch >> 8) & 0xf];
6063 *p++ = hexdigits[(ch >> 4) & 0xf];
6064 *p++ = hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006065 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006066 else
Christian Heimesfe337bf2008-03-23 21:54:12 +00006067#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006068 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
6069 if (ch >= 0xD800 && ch < 0xDC00) {
6070 Py_UNICODE ch2;
6071 Py_UCS4 ucs;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006072
Benjamin Peterson29060642009-01-31 22:14:21 +00006073 ch2 = *s++;
6074 size--;
Georg Brandl78eef3de2010-08-01 20:51:02 +00006075 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006076 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
6077 *p++ = '\\';
6078 *p++ = 'U';
6079 *p++ = hexdigits[(ucs >> 28) & 0xf];
6080 *p++ = hexdigits[(ucs >> 24) & 0xf];
6081 *p++ = hexdigits[(ucs >> 20) & 0xf];
6082 *p++ = hexdigits[(ucs >> 16) & 0xf];
6083 *p++ = hexdigits[(ucs >> 12) & 0xf];
6084 *p++ = hexdigits[(ucs >> 8) & 0xf];
6085 *p++ = hexdigits[(ucs >> 4) & 0xf];
6086 *p++ = hexdigits[ucs & 0xf];
6087 continue;
6088 }
6089 /* Fall through: isolated surrogates are copied as-is */
6090 s--;
6091 size++;
6092 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006093#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006094 /* Map 16-bit characters to '\uxxxx' */
6095 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006096 *p++ = '\\';
6097 *p++ = 'u';
Walter Dörwalddb5d33e2007-05-12 11:13:47 +00006098 *p++ = hexdigits[(ch >> 12) & 0xf];
6099 *p++ = hexdigits[(ch >> 8) & 0xf];
6100 *p++ = hexdigits[(ch >> 4) & 0xf];
6101 *p++ = hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006102 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006103 /* Copy everything else as-is */
6104 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006105 *p++ = (char) ch;
6106 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006107 size = p - q;
6108
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006109 assert(size > 0);
6110 if (_PyBytes_Resize(&repr, size) < 0)
6111 return NULL;
6112 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006113}
6114
Alexander Belopolsky40018472011-02-26 01:02:56 +00006115PyObject *
6116PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006117{
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006118 PyObject *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006119 if (!PyUnicode_Check(unicode)) {
Walter Dörwald711005d2007-05-12 12:03:26 +00006120 PyErr_BadArgument();
6121 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006122 }
Walter Dörwald711005d2007-05-12 12:03:26 +00006123 s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
6124 PyUnicode_GET_SIZE(unicode));
6125
Alexandre Vassalotti9cb6f7f2008-12-27 09:09:15 +00006126 return s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006127}
6128
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006129/* --- Unicode Internal Codec ------------------------------------------- */
6130
Alexander Belopolsky40018472011-02-26 01:02:56 +00006131PyObject *
6132_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006133 Py_ssize_t size,
6134 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006135{
6136 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006137 Py_ssize_t startinpos;
6138 Py_ssize_t endinpos;
6139 Py_ssize_t outpos;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006140 PyUnicodeObject *v;
6141 Py_UNICODE *p;
6142 const char *end;
6143 const char *reason;
6144 PyObject *errorHandler = NULL;
6145 PyObject *exc = NULL;
6146
Neal Norwitzd43069c2006-01-08 01:12:10 +00006147#ifdef Py_UNICODE_WIDE
6148 Py_UNICODE unimax = PyUnicode_GetMax();
6149#endif
6150
Thomas Wouters89f507f2006-12-13 04:49:30 +00006151 /* XXX overflow detection missing */
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006152 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
6153 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006154 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006155 /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
6156 as string was created with the old API. */
6157 if (PyUnicode_GET_SIZE(v) == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006158 return (PyObject *)v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006159 p = PyUnicode_AS_UNICODE(v);
6160 end = s + size;
6161
6162 while (s < end) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00006163 memcpy(p, s, sizeof(Py_UNICODE));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006164 /* We have to sanity check the raw data, otherwise doom looms for
6165 some malformed UCS-4 data. */
6166 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006167#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006168 *p > unimax || *p < 0 ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006169#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006170 end-s < Py_UNICODE_SIZE
6171 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006172 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006173 startinpos = s - starts;
6174 if (end-s < Py_UNICODE_SIZE) {
6175 endinpos = end-starts;
6176 reason = "truncated input";
6177 }
6178 else {
6179 endinpos = s - starts + Py_UNICODE_SIZE;
6180 reason = "illegal code point (> 0x10FFFF)";
6181 }
6182 outpos = p - PyUnicode_AS_UNICODE(v);
6183 if (unicode_decode_call_errorhandler(
6184 errors, &errorHandler,
6185 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006186 &starts, &end, &startinpos, &endinpos, &exc, &s,
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00006187 &v, &outpos, &p)) {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006188 goto onError;
6189 }
6190 }
6191 else {
6192 p++;
6193 s += Py_UNICODE_SIZE;
6194 }
6195 }
6196
Victor Stinnerfe226c02011-10-03 03:52:20 +02006197 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006198 goto onError;
6199 Py_XDECREF(errorHandler);
6200 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006201#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006202 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006203 Py_DECREF(v);
6204 return NULL;
6205 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006206#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006207 assert(_PyUnicode_CheckConsistency(v, 1));
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006208 return (PyObject *)v;
6209
Benjamin Peterson29060642009-01-31 22:14:21 +00006210 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006211 Py_XDECREF(v);
6212 Py_XDECREF(errorHandler);
6213 Py_XDECREF(exc);
6214 return NULL;
6215}
6216
Guido van Rossumd57fd912000-03-10 22:53:23 +00006217/* --- Latin-1 Codec ------------------------------------------------------ */
6218
Alexander Belopolsky40018472011-02-26 01:02:56 +00006219PyObject *
6220PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006221 Py_ssize_t size,
6222 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006223{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006224 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006225 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006226}
6227
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006228/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006229static void
6230make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006231 const char *encoding,
6232 const Py_UNICODE *unicode, Py_ssize_t size,
6233 Py_ssize_t startpos, Py_ssize_t endpos,
6234 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006235{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006236 if (*exceptionObject == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006237 *exceptionObject = PyUnicodeEncodeError_Create(
6238 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006239 }
6240 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006241 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6242 goto onError;
6243 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6244 goto onError;
6245 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6246 goto onError;
6247 return;
6248 onError:
6249 Py_DECREF(*exceptionObject);
6250 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006251 }
6252}
6253
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006254/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006255static void
6256raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006257 const char *encoding,
6258 const Py_UNICODE *unicode, Py_ssize_t size,
6259 Py_ssize_t startpos, Py_ssize_t endpos,
6260 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006261{
6262 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006263 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006264 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006265 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006266}
6267
6268/* error handling callback helper:
6269 build arguments, call the callback and check the arguments,
6270 put the result into newpos and return the replacement string, which
6271 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006272static PyObject *
6273unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006274 PyObject **errorHandler,
6275 const char *encoding, const char *reason,
6276 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
6277 Py_ssize_t startpos, Py_ssize_t endpos,
6278 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006279{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006280 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006281
6282 PyObject *restuple;
6283 PyObject *resunicode;
6284
6285 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006286 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006287 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006288 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006289 }
6290
6291 make_encode_exception(exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00006292 encoding, unicode, size, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006293 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006294 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006295
6296 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006297 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006298 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006299 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006300 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006301 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006302 Py_DECREF(restuple);
6303 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006304 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006305 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006306 &resunicode, newpos)) {
6307 Py_DECREF(restuple);
6308 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006309 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006310 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6311 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6312 Py_DECREF(restuple);
6313 return NULL;
6314 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006315 if (*newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006316 *newpos = size+*newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006317 if (*newpos<0 || *newpos>size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006318 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6319 Py_DECREF(restuple);
6320 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006321 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006322 Py_INCREF(resunicode);
6323 Py_DECREF(restuple);
6324 return resunicode;
6325}
6326
Alexander Belopolsky40018472011-02-26 01:02:56 +00006327static PyObject *
6328unicode_encode_ucs1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006329 Py_ssize_t size,
6330 const char *errors,
6331 int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006332{
6333 /* output object */
6334 PyObject *res;
6335 /* pointers to the beginning and end+1 of input */
6336 const Py_UNICODE *startp = p;
6337 const Py_UNICODE *endp = p + size;
6338 /* pointer to the beginning of the unencodable characters */
6339 /* const Py_UNICODE *badp = NULL; */
6340 /* pointer into the output */
6341 char *str;
6342 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006343 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006344 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6345 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006346 PyObject *errorHandler = NULL;
6347 PyObject *exc = NULL;
6348 /* the following variable is used for caching string comparisons
6349 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6350 int known_errorHandler = -1;
6351
6352 /* allocate enough for a simple encoding without
6353 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006354 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006355 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006356 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006357 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006358 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006359 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006360 ressize = size;
6361
6362 while (p<endp) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006363 Py_UNICODE c = *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006364
Benjamin Peterson29060642009-01-31 22:14:21 +00006365 /* can we encode this? */
6366 if (c<limit) {
6367 /* no overflow check, because we know that the space is enough */
6368 *str++ = (char)c;
6369 ++p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006370 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006371 else {
6372 Py_ssize_t unicodepos = p-startp;
6373 Py_ssize_t requiredsize;
6374 PyObject *repunicode;
6375 Py_ssize_t repsize;
6376 Py_ssize_t newpos;
6377 Py_ssize_t respos;
6378 Py_UNICODE *uni2;
6379 /* startpos for collecting unencodable chars */
6380 const Py_UNICODE *collstart = p;
6381 const Py_UNICODE *collend = p;
6382 /* find all unecodable characters */
6383 while ((collend < endp) && ((*collend)>=limit))
6384 ++collend;
6385 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6386 if (known_errorHandler==-1) {
6387 if ((errors==NULL) || (!strcmp(errors, "strict")))
6388 known_errorHandler = 1;
6389 else if (!strcmp(errors, "replace"))
6390 known_errorHandler = 2;
6391 else if (!strcmp(errors, "ignore"))
6392 known_errorHandler = 3;
6393 else if (!strcmp(errors, "xmlcharrefreplace"))
6394 known_errorHandler = 4;
6395 else
6396 known_errorHandler = 0;
6397 }
6398 switch (known_errorHandler) {
6399 case 1: /* strict */
6400 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
6401 goto onError;
6402 case 2: /* replace */
6403 while (collstart++<collend)
6404 *str++ = '?'; /* fall through */
6405 case 3: /* ignore */
6406 p = collend;
6407 break;
6408 case 4: /* xmlcharrefreplace */
6409 respos = str - PyBytes_AS_STRING(res);
6410 /* determine replacement size (temporarily (mis)uses p) */
6411 for (p = collstart, repsize = 0; p < collend; ++p) {
6412 if (*p<10)
6413 repsize += 2+1+1;
6414 else if (*p<100)
6415 repsize += 2+2+1;
6416 else if (*p<1000)
6417 repsize += 2+3+1;
6418 else if (*p<10000)
6419 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006420#ifndef Py_UNICODE_WIDE
Benjamin Peterson29060642009-01-31 22:14:21 +00006421 else
6422 repsize += 2+5+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00006423#else
Benjamin Peterson29060642009-01-31 22:14:21 +00006424 else if (*p<100000)
6425 repsize += 2+5+1;
6426 else if (*p<1000000)
6427 repsize += 2+6+1;
6428 else
6429 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00006430#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00006431 }
6432 requiredsize = respos+repsize+(endp-collend);
6433 if (requiredsize > ressize) {
6434 if (requiredsize<2*ressize)
6435 requiredsize = 2*ressize;
6436 if (_PyBytes_Resize(&res, requiredsize))
6437 goto onError;
6438 str = PyBytes_AS_STRING(res) + respos;
6439 ressize = requiredsize;
6440 }
6441 /* generate replacement (temporarily (mis)uses p) */
6442 for (p = collstart; p < collend; ++p) {
6443 str += sprintf(str, "&#%d;", (int)*p);
6444 }
6445 p = collend;
6446 break;
6447 default:
6448 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6449 encoding, reason, startp, size, &exc,
6450 collstart-startp, collend-startp, &newpos);
6451 if (repunicode == NULL)
6452 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006453 if (PyBytes_Check(repunicode)) {
6454 /* Directly copy bytes result to output. */
6455 repsize = PyBytes_Size(repunicode);
6456 if (repsize > 1) {
6457 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006458 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006459 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6460 Py_DECREF(repunicode);
6461 goto onError;
6462 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006463 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006464 ressize += repsize-1;
6465 }
6466 memcpy(str, PyBytes_AsString(repunicode), repsize);
6467 str += repsize;
6468 p = startp + newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006469 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006470 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006471 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006472 /* need more space? (at least enough for what we
6473 have+the replacement+the rest of the string, so
6474 we won't have to check space for encodable characters) */
6475 respos = str - PyBytes_AS_STRING(res);
6476 repsize = PyUnicode_GET_SIZE(repunicode);
6477 requiredsize = respos+repsize+(endp-collend);
6478 if (requiredsize > ressize) {
6479 if (requiredsize<2*ressize)
6480 requiredsize = 2*ressize;
6481 if (_PyBytes_Resize(&res, requiredsize)) {
6482 Py_DECREF(repunicode);
6483 goto onError;
6484 }
6485 str = PyBytes_AS_STRING(res) + respos;
6486 ressize = requiredsize;
6487 }
6488 /* check if there is anything unencodable in the replacement
6489 and copy it to the output */
6490 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
6491 c = *uni2;
6492 if (c >= limit) {
6493 raise_encode_exception(&exc, encoding, startp, size,
6494 unicodepos, unicodepos+1, reason);
6495 Py_DECREF(repunicode);
6496 goto onError;
6497 }
6498 *str = (char)c;
6499 }
6500 p = startp + newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006501 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006502 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006503 }
6504 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006505 /* Resize if we allocated to much */
6506 size = str - PyBytes_AS_STRING(res);
6507 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006508 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006509 if (_PyBytes_Resize(&res, size) < 0)
6510 goto onError;
6511 }
6512
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006513 Py_XDECREF(errorHandler);
6514 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006515 return res;
6516
6517 onError:
6518 Py_XDECREF(res);
6519 Py_XDECREF(errorHandler);
6520 Py_XDECREF(exc);
6521 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006522}
6523
Alexander Belopolsky40018472011-02-26 01:02:56 +00006524PyObject *
6525PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006526 Py_ssize_t size,
6527 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006528{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006529 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006530}
6531
Alexander Belopolsky40018472011-02-26 01:02:56 +00006532PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006533_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006534{
6535 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006536 PyErr_BadArgument();
6537 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006538 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006539 if (PyUnicode_READY(unicode) == -1)
6540 return NULL;
6541 /* Fast path: if it is a one-byte string, construct
6542 bytes object directly. */
6543 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6544 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6545 PyUnicode_GET_LENGTH(unicode));
6546 /* Non-Latin-1 characters present. Defer to above function to
6547 raise the exception. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006548 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006549 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006550 errors);
6551}
6552
6553PyObject*
6554PyUnicode_AsLatin1String(PyObject *unicode)
6555{
6556 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006557}
6558
6559/* --- 7-bit ASCII Codec -------------------------------------------------- */
6560
Alexander Belopolsky40018472011-02-26 01:02:56 +00006561PyObject *
6562PyUnicode_DecodeASCII(const char *s,
6563 Py_ssize_t size,
6564 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006565{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006566 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006567 PyUnicodeObject *v;
Victor Stinner702c7342011-10-05 13:50:52 +02006568 Py_UNICODE *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006569 Py_ssize_t startinpos;
6570 Py_ssize_t endinpos;
6571 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006572 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006573 int has_error;
6574 const unsigned char *p = (const unsigned char *)s;
6575 const unsigned char *end = p + size;
6576 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006577 PyObject *errorHandler = NULL;
6578 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006579
Guido van Rossumd57fd912000-03-10 22:53:23 +00006580 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006581 if (size == 1 && (unsigned char)s[0] < 128)
6582 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006583
Victor Stinner702c7342011-10-05 13:50:52 +02006584 has_error = 0;
6585 while (p < end && !has_error) {
6586 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6587 an explanation. */
6588 if (!((size_t) p & LONG_PTR_MASK)) {
6589 /* Help register allocation */
6590 register const unsigned char *_p = p;
6591 while (_p < aligned_end) {
6592 unsigned long value = *(unsigned long *) _p;
6593 if (value & ASCII_CHAR_MASK) {
6594 has_error = 1;
6595 break;
6596 }
6597 _p += SIZEOF_LONG;
6598 }
6599 if (_p == end)
6600 break;
6601 if (has_error)
6602 break;
6603 p = _p;
6604 }
6605 if (*p & 0x80) {
6606 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006607 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006608 }
6609 else {
6610 ++p;
6611 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006612 }
Victor Stinner702c7342011-10-05 13:50:52 +02006613 if (!has_error)
6614 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006615
Guido van Rossumd57fd912000-03-10 22:53:23 +00006616 v = _PyUnicode_New(size);
6617 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006618 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006619 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006620 return (PyObject *)v;
Victor Stinner702c7342011-10-05 13:50:52 +02006621 u = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006622 e = s + size;
6623 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006624 register unsigned char c = (unsigned char)*s;
6625 if (c < 128) {
Victor Stinner702c7342011-10-05 13:50:52 +02006626 *u++ = c;
Benjamin Peterson29060642009-01-31 22:14:21 +00006627 ++s;
6628 }
6629 else {
6630 startinpos = s-starts;
6631 endinpos = startinpos + 1;
Victor Stinner702c7342011-10-05 13:50:52 +02006632 outpos = u - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006633 if (unicode_decode_call_errorhandler(
6634 errors, &errorHandler,
6635 "ascii", "ordinal not in range(128)",
6636 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinner702c7342011-10-05 13:50:52 +02006637 &v, &outpos, &u))
Benjamin Peterson29060642009-01-31 22:14:21 +00006638 goto onError;
6639 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006640 }
Victor Stinner702c7342011-10-05 13:50:52 +02006641 if (u - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
6642 if (PyUnicode_Resize((PyObject**)&v, u - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006643 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006644 Py_XDECREF(errorHandler);
6645 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02006646#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006647 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006648 Py_DECREF(v);
6649 return NULL;
6650 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006651#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006652 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006653 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00006654
Benjamin Peterson29060642009-01-31 22:14:21 +00006655 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006656 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006657 Py_XDECREF(errorHandler);
6658 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006659 return NULL;
6660}
6661
Alexander Belopolsky40018472011-02-26 01:02:56 +00006662PyObject *
6663PyUnicode_EncodeASCII(const Py_UNICODE *p,
6664 Py_ssize_t size,
6665 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006666{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006667 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006668}
6669
Alexander Belopolsky40018472011-02-26 01:02:56 +00006670PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006671_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006672{
6673 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006674 PyErr_BadArgument();
6675 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006676 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006677 if (PyUnicode_READY(unicode) == -1)
6678 return NULL;
6679 /* Fast path: if it is an ASCII-only string, construct bytes object
6680 directly. Else defer to above function to raise the exception. */
6681 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6682 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6683 PyUnicode_GET_LENGTH(unicode));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006684 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006685 PyUnicode_GET_SIZE(unicode),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006686 errors);
6687}
6688
6689PyObject *
6690PyUnicode_AsASCIIString(PyObject *unicode)
6691{
6692 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006693}
6694
Victor Stinner99b95382011-07-04 14:23:54 +02006695#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006696
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006697/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006698
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006699#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006700#define NEED_RETRY
6701#endif
6702
6703/* XXX This code is limited to "true" double-byte encodings, as
6704 a) it assumes an incomplete character consists of a single byte, and
6705 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
Benjamin Peterson29060642009-01-31 22:14:21 +00006706 encodings, see IsDBCSLeadByteEx documentation. */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006707
/* Return non-zero if the byte at s[offset] should be treated as a DBCS
   lead byte, 0 otherwise.

   A byte that IsDBCSLeadByte() accepts is only counted when CharPrev()
   could not step back, when the previous character does not itself start
   with a lead byte, or when the previous character is two bytes wide. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
6719
6720/*
6721 * Decode MBCS string into unicode object. If 'final' is set, converts
6722 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6723 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006724static int
6725decode_mbcs(PyUnicodeObject **v,
6726 const char *s, /* MBCS string */
6727 int size, /* sizeof MBCS string */
6728 int final,
6729 const char *errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006730{
6731 Py_UNICODE *p;
Victor Stinner554f3f02010-06-16 23:33:54 +00006732 Py_ssize_t n;
6733 DWORD usize;
6734 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006735
6736 assert(size >= 0);
6737
Victor Stinner554f3f02010-06-16 23:33:54 +00006738 /* check and handle 'errors' arg */
6739 if (errors==NULL || strcmp(errors, "strict")==0)
6740 flags = MB_ERR_INVALID_CHARS;
6741 else if (strcmp(errors, "ignore")==0)
6742 flags = 0;
6743 else {
6744 PyErr_Format(PyExc_ValueError,
6745 "mbcs encoding does not support errors='%s'",
6746 errors);
6747 return -1;
6748 }
6749
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006750 /* Skip trailing lead-byte unless 'final' is set */
6751 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006752 --size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006753
6754 /* First get the size of the result */
6755 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006756 usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6757 if (usize==0)
6758 goto mbcs_decode_error;
6759 } else
6760 usize = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006761
6762 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006763 /* Create unicode object */
6764 *v = _PyUnicode_New(usize);
6765 if (*v == NULL)
6766 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006767 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006768 }
6769 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006770 /* Extend unicode object */
6771 n = PyUnicode_GET_SIZE(*v);
Victor Stinner2fd82272011-10-03 04:06:05 +02006772 if (PyUnicode_Resize((PyObject**)v, n + usize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006773 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006774 }
6775
6776 /* Do the conversion */
Victor Stinner554f3f02010-06-16 23:33:54 +00006777 if (usize > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006778 p = PyUnicode_AS_UNICODE(*v) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006779 if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6780 goto mbcs_decode_error;
Benjamin Peterson29060642009-01-31 22:14:21 +00006781 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006782 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006783 return size;
Victor Stinner554f3f02010-06-16 23:33:54 +00006784
6785mbcs_decode_error:
6786 /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6787 we raise a UnicodeDecodeError - else it is a 'generic'
6788 windows error
6789 */
6790 if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6791 /* Ideally, we should get reason from FormatMessage - this
6792 is the Windows 2000 English version of the message
6793 */
6794 PyObject *exc = NULL;
6795 const char *reason = "No mapping for the Unicode character exists "
6796 "in the target multi-byte code page.";
6797 make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6798 if (exc != NULL) {
6799 PyCodec_StrictErrors(exc);
6800 Py_DECREF(exc);
6801 }
6802 } else {
6803 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6804 }
6805 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006806}
6807
Alexander Belopolsky40018472011-02-26 01:02:56 +00006808PyObject *
6809PyUnicode_DecodeMBCSStateful(const char *s,
6810 Py_ssize_t size,
6811 const char *errors,
6812 Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006813{
6814 PyUnicodeObject *v = NULL;
6815 int done;
6816
6817 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006818 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006819
6820#ifdef NEED_RETRY
6821 retry:
6822 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006823 done = decode_mbcs(&v, s, INT_MAX, 0, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006824 else
6825#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006826 done = decode_mbcs(&v, s, (int)size, !consumed, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006827
6828 if (done < 0) {
6829 Py_XDECREF(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006830 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006831 }
6832
6833 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006834 *consumed += done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006835
6836#ifdef NEED_RETRY
6837 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006838 s += done;
6839 size -= done;
6840 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006841 }
6842#endif
Victor Stinner17efeed2011-10-04 20:05:46 +02006843#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02006844 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006845 Py_DECREF(v);
6846 return NULL;
6847 }
Victor Stinner17efeed2011-10-04 20:05:46 +02006848#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006849 assert(_PyUnicode_CheckConsistency(v, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006850 return (PyObject *)v;
6851}
6852
Alexander Belopolsky40018472011-02-26 01:02:56 +00006853PyObject *
6854PyUnicode_DecodeMBCS(const char *s,
6855 Py_ssize_t size,
6856 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006857{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006858 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6859}
6860
6861/*
6862 * Convert unicode into string object (MBCS).
6863 * Returns 0 if succeed, -1 otherwise.
6864 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006865static int
6866encode_mbcs(PyObject **repr,
6867 const Py_UNICODE *p, /* unicode */
6868 int size, /* size of unicode */
6869 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006870{
Victor Stinner554f3f02010-06-16 23:33:54 +00006871 BOOL usedDefaultChar = FALSE;
6872 BOOL *pusedDefaultChar;
6873 int mbcssize;
6874 Py_ssize_t n;
6875 PyObject *exc = NULL;
6876 DWORD flags;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006877
6878 assert(size >= 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006879
Victor Stinner554f3f02010-06-16 23:33:54 +00006880 /* check and handle 'errors' arg */
6881 if (errors==NULL || strcmp(errors, "strict")==0) {
6882 flags = WC_NO_BEST_FIT_CHARS;
6883 pusedDefaultChar = &usedDefaultChar;
6884 } else if (strcmp(errors, "replace")==0) {
6885 flags = 0;
6886 pusedDefaultChar = NULL;
6887 } else {
6888 PyErr_Format(PyExc_ValueError,
6889 "mbcs encoding does not support errors='%s'",
6890 errors);
6891 return -1;
6892 }
6893
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006894 /* First get the size of the result */
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006895 if (size > 0) {
Victor Stinner554f3f02010-06-16 23:33:54 +00006896 mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
6897 NULL, pusedDefaultChar);
Benjamin Peterson29060642009-01-31 22:14:21 +00006898 if (mbcssize == 0) {
6899 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6900 return -1;
6901 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006902 /* If we used a default char, then we failed! */
6903 if (pusedDefaultChar && *pusedDefaultChar)
6904 goto mbcs_encode_error;
6905 } else {
6906 mbcssize = 0;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006907 }
6908
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006909 if (*repr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006910 /* Create string object */
6911 *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
6912 if (*repr == NULL)
6913 return -1;
Victor Stinner554f3f02010-06-16 23:33:54 +00006914 n = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006915 }
6916 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006917 /* Extend string object */
6918 n = PyBytes_Size(*repr);
6919 if (_PyBytes_Resize(repr, n + mbcssize) < 0)
6920 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006921 }
6922
6923 /* Do the conversion */
6924 if (size > 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006925 char *s = PyBytes_AS_STRING(*repr) + n;
Victor Stinner554f3f02010-06-16 23:33:54 +00006926 if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
6927 NULL, pusedDefaultChar)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006928 PyErr_SetFromWindowsErrWithFilename(0, NULL);
6929 return -1;
6930 }
Victor Stinner554f3f02010-06-16 23:33:54 +00006931 if (pusedDefaultChar && *pusedDefaultChar)
6932 goto mbcs_encode_error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006933 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006934 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00006935
6936mbcs_encode_error:
6937 raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
6938 Py_XDECREF(exc);
6939 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006940}
6941
Alexander Belopolsky40018472011-02-26 01:02:56 +00006942PyObject *
6943PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6944 Py_ssize_t size,
6945 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006946{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006947 PyObject *repr = NULL;
6948 int ret;
Guido van Rossum03e29f12000-05-04 15:52:20 +00006949
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006950#ifdef NEED_RETRY
Benjamin Peterson29060642009-01-31 22:14:21 +00006951 retry:
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006952 if (size > INT_MAX)
Victor Stinner554f3f02010-06-16 23:33:54 +00006953 ret = encode_mbcs(&repr, p, INT_MAX, errors);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006954 else
6955#endif
Victor Stinner554f3f02010-06-16 23:33:54 +00006956 ret = encode_mbcs(&repr, p, (int)size, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006957
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006958 if (ret < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006959 Py_XDECREF(repr);
6960 return NULL;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006961 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006962
6963#ifdef NEED_RETRY
6964 if (size > INT_MAX) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006965 p += INT_MAX;
6966 size -= INT_MAX;
6967 goto retry;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006968 }
6969#endif
6970
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006971 return repr;
6972}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006973
Alexander Belopolsky40018472011-02-26 01:02:56 +00006974PyObject *
6975PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006976{
6977 if (!PyUnicode_Check(unicode)) {
6978 PyErr_BadArgument();
6979 return NULL;
6980 }
6981 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00006982 PyUnicode_GET_SIZE(unicode),
6983 NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00006984}
6985
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006986#undef NEED_RETRY
6987
Victor Stinner99b95382011-07-04 14:23:54 +02006988#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006989
Guido van Rossumd57fd912000-03-10 22:53:23 +00006990/* --- Character Mapping Codec -------------------------------------------- */
6991
Alexander Belopolsky40018472011-02-26 01:02:56 +00006992PyObject *
6993PyUnicode_DecodeCharmap(const char *s,
6994 Py_ssize_t size,
6995 PyObject *mapping,
6996 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006997{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006998 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006999 Py_ssize_t startinpos;
7000 Py_ssize_t endinpos;
7001 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007002 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007003 PyUnicodeObject *v;
7004 Py_UNICODE *p;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007005 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007006 PyObject *errorHandler = NULL;
7007 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007008 Py_UNICODE *mapstring = NULL;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007009 Py_ssize_t maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00007010
Guido van Rossumd57fd912000-03-10 22:53:23 +00007011 /* Default to Latin-1 */
7012 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007013 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007014
7015 v = _PyUnicode_New(size);
7016 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007017 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007018 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007019 return (PyObject *)v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007020 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007021 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007022 if (PyUnicode_CheckExact(mapping)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007023 mapstring = PyUnicode_AS_UNICODE(mapping);
7024 maplen = PyUnicode_GET_SIZE(mapping);
7025 while (s < e) {
7026 unsigned char ch = *s;
7027 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007028
Benjamin Peterson29060642009-01-31 22:14:21 +00007029 if (ch < maplen)
7030 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00007031
Benjamin Peterson29060642009-01-31 22:14:21 +00007032 if (x == 0xfffe) {
7033 /* undefined mapping */
7034 outpos = p-PyUnicode_AS_UNICODE(v);
7035 startinpos = s-starts;
7036 endinpos = startinpos+1;
7037 if (unicode_decode_call_errorhandler(
7038 errors, &errorHandler,
7039 "charmap", "character maps to <undefined>",
7040 &starts, &e, &startinpos, &endinpos, &exc, &s,
7041 &v, &outpos, &p)) {
7042 goto onError;
7043 }
7044 continue;
7045 }
7046 *p++ = x;
7047 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007048 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007049 }
7050 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007051 while (s < e) {
7052 unsigned char ch = *s;
7053 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007054
Benjamin Peterson29060642009-01-31 22:14:21 +00007055 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7056 w = PyLong_FromLong((long)ch);
7057 if (w == NULL)
7058 goto onError;
7059 x = PyObject_GetItem(mapping, w);
7060 Py_DECREF(w);
7061 if (x == NULL) {
7062 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7063 /* No mapping found means: mapping is undefined. */
7064 PyErr_Clear();
7065 x = Py_None;
7066 Py_INCREF(x);
7067 } else
7068 goto onError;
7069 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007070
Benjamin Peterson29060642009-01-31 22:14:21 +00007071 /* Apply mapping */
7072 if (PyLong_Check(x)) {
7073 long value = PyLong_AS_LONG(x);
7074 if (value < 0 || value > 65535) {
7075 PyErr_SetString(PyExc_TypeError,
7076 "character mapping must be in range(65536)");
7077 Py_DECREF(x);
7078 goto onError;
7079 }
7080 *p++ = (Py_UNICODE)value;
7081 }
7082 else if (x == Py_None) {
7083 /* undefined mapping */
7084 outpos = p-PyUnicode_AS_UNICODE(v);
7085 startinpos = s-starts;
7086 endinpos = startinpos+1;
7087 if (unicode_decode_call_errorhandler(
7088 errors, &errorHandler,
7089 "charmap", "character maps to <undefined>",
7090 &starts, &e, &startinpos, &endinpos, &exc, &s,
7091 &v, &outpos, &p)) {
7092 Py_DECREF(x);
7093 goto onError;
7094 }
7095 Py_DECREF(x);
7096 continue;
7097 }
7098 else if (PyUnicode_Check(x)) {
7099 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007100
Benjamin Peterson29060642009-01-31 22:14:21 +00007101 if (targetsize == 1)
7102 /* 1-1 mapping */
7103 *p++ = *PyUnicode_AS_UNICODE(x);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007104
Benjamin Peterson29060642009-01-31 22:14:21 +00007105 else if (targetsize > 1) {
7106 /* 1-n mapping */
7107 if (targetsize > extrachars) {
7108 /* resize first */
7109 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
7110 Py_ssize_t needed = (targetsize - extrachars) + \
7111 (targetsize << 2);
7112 extrachars += needed;
7113 /* XXX overflow detection missing */
Victor Stinnerfe226c02011-10-03 03:52:20 +02007114 if (PyUnicode_Resize((PyObject**)&v,
Benjamin Peterson29060642009-01-31 22:14:21 +00007115 PyUnicode_GET_SIZE(v) + needed) < 0) {
7116 Py_DECREF(x);
7117 goto onError;
7118 }
7119 p = PyUnicode_AS_UNICODE(v) + oldpos;
7120 }
7121 Py_UNICODE_COPY(p,
7122 PyUnicode_AS_UNICODE(x),
7123 targetsize);
7124 p += targetsize;
7125 extrachars -= targetsize;
7126 }
7127 /* 1-0 mapping: skip the character */
7128 }
7129 else {
7130 /* wrong return value */
7131 PyErr_SetString(PyExc_TypeError,
7132 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007133 Py_DECREF(x);
7134 goto onError;
7135 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007136 Py_DECREF(x);
7137 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007138 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007139 }
7140 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Victor Stinnerfe226c02011-10-03 03:52:20 +02007141 if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007142 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007143 Py_XDECREF(errorHandler);
7144 Py_XDECREF(exc);
Victor Stinner17efeed2011-10-04 20:05:46 +02007145#ifndef DONT_MAKE_RESULT_READY
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02007146 if (_PyUnicode_READY_REPLACE(&v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007147 Py_DECREF(v);
7148 return NULL;
7149 }
Victor Stinner17efeed2011-10-04 20:05:46 +02007150#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02007151 assert(_PyUnicode_CheckConsistency(v, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007152 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00007153
Benjamin Peterson29060642009-01-31 22:14:21 +00007154 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007155 Py_XDECREF(errorHandler);
7156 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007157 Py_XDECREF(v);
7158 return NULL;
7159}
7160
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007161/* Charmap encoding: the lookup table */
7162
Alexander Belopolsky40018472011-02-26 01:02:56 +00007163struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007164 PyObject_HEAD
7165 unsigned char level1[32];
7166 int count2, count3;
7167 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007168};
7169
7170static PyObject*
7171encoding_map_size(PyObject *obj, PyObject* args)
7172{
7173 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007174 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007175 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007176}
7177
7178static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007179 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007180 PyDoc_STR("Return the size (in bytes) of this object") },
7181 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007182};
7183
7184static void
7185encoding_map_dealloc(PyObject* o)
7186{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007187 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007188}
7189
7190static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007191 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007192 "EncodingMap", /*tp_name*/
7193 sizeof(struct encoding_map), /*tp_basicsize*/
7194 0, /*tp_itemsize*/
7195 /* methods */
7196 encoding_map_dealloc, /*tp_dealloc*/
7197 0, /*tp_print*/
7198 0, /*tp_getattr*/
7199 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007200 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007201 0, /*tp_repr*/
7202 0, /*tp_as_number*/
7203 0, /*tp_as_sequence*/
7204 0, /*tp_as_mapping*/
7205 0, /*tp_hash*/
7206 0, /*tp_call*/
7207 0, /*tp_str*/
7208 0, /*tp_getattro*/
7209 0, /*tp_setattro*/
7210 0, /*tp_as_buffer*/
7211 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7212 0, /*tp_doc*/
7213 0, /*tp_traverse*/
7214 0, /*tp_clear*/
7215 0, /*tp_richcompare*/
7216 0, /*tp_weaklistoffset*/
7217 0, /*tp_iter*/
7218 0, /*tp_iternext*/
7219 encoding_map_methods, /*tp_methods*/
7220 0, /*tp_members*/
7221 0, /*tp_getset*/
7222 0, /*tp_base*/
7223 0, /*tp_dict*/
7224 0, /*tp_descr_get*/
7225 0, /*tp_descr_set*/
7226 0, /*tp_dictoffset*/
7227 0, /*tp_init*/
7228 0, /*tp_alloc*/
7229 0, /*tp_new*/
7230 0, /*tp_free*/
7231 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007232};
7233
7234PyObject*
7235PyUnicode_BuildEncodingMap(PyObject* string)
7236{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007237 PyObject *result;
7238 struct encoding_map *mresult;
7239 int i;
7240 int need_dict = 0;
7241 unsigned char level1[32];
7242 unsigned char level2[512];
7243 unsigned char *mlevel1, *mlevel2, *mlevel3;
7244 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007245 int kind;
7246 void *data;
7247 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007248
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007249 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007250 PyErr_BadArgument();
7251 return NULL;
7252 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007253 kind = PyUnicode_KIND(string);
7254 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007255 memset(level1, 0xFF, sizeof level1);
7256 memset(level2, 0xFF, sizeof level2);
7257
7258 /* If there isn't a one-to-one mapping of NULL to \0,
7259 or if there are non-BMP characters, we need to use
7260 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007261 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007262 need_dict = 1;
7263 for (i = 1; i < 256; i++) {
7264 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007265 ch = PyUnicode_READ(kind, data, i);
7266 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007267 need_dict = 1;
7268 break;
7269 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007270 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007271 /* unmapped character */
7272 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007273 l1 = ch >> 11;
7274 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007275 if (level1[l1] == 0xFF)
7276 level1[l1] = count2++;
7277 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007278 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007279 }
7280
7281 if (count2 >= 0xFF || count3 >= 0xFF)
7282 need_dict = 1;
7283
7284 if (need_dict) {
7285 PyObject *result = PyDict_New();
7286 PyObject *key, *value;
7287 if (!result)
7288 return NULL;
7289 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007290 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007291 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007292 if (!key || !value)
7293 goto failed1;
7294 if (PyDict_SetItem(result, key, value) == -1)
7295 goto failed1;
7296 Py_DECREF(key);
7297 Py_DECREF(value);
7298 }
7299 return result;
7300 failed1:
7301 Py_XDECREF(key);
7302 Py_XDECREF(value);
7303 Py_DECREF(result);
7304 return NULL;
7305 }
7306
7307 /* Create a three-level trie */
7308 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7309 16*count2 + 128*count3 - 1);
7310 if (!result)
7311 return PyErr_NoMemory();
7312 PyObject_Init(result, &EncodingMapType);
7313 mresult = (struct encoding_map*)result;
7314 mresult->count2 = count2;
7315 mresult->count3 = count3;
7316 mlevel1 = mresult->level1;
7317 mlevel2 = mresult->level23;
7318 mlevel3 = mresult->level23 + 16*count2;
7319 memcpy(mlevel1, level1, 32);
7320 memset(mlevel2, 0xFF, 16*count2);
7321 memset(mlevel3, 0, 128*count3);
7322 count3 = 0;
7323 for (i = 1; i < 256; i++) {
7324 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007325 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007326 /* unmapped character */
7327 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007328 o1 = PyUnicode_READ(kind, data, i)>>11;
7329 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007330 i2 = 16*mlevel1[o1] + o2;
7331 if (mlevel2[i2] == 0xFF)
7332 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007333 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007334 i3 = 128*mlevel2[i2] + o3;
7335 mlevel3[i3] = i;
7336 }
7337 return result;
7338}
7339
7340static int
7341encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
7342{
7343 struct encoding_map *map = (struct encoding_map*)mapping;
7344 int l1 = c>>11;
7345 int l2 = (c>>7) & 0xF;
7346 int l3 = c & 0x7F;
7347 int i;
7348
7349#ifdef Py_UNICODE_WIDE
7350 if (c > 0xFFFF) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007351 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007352 }
7353#endif
7354 if (c == 0)
7355 return 0;
7356 /* level 1*/
7357 i = map->level1[l1];
7358 if (i == 0xFF) {
7359 return -1;
7360 }
7361 /* level 2*/
7362 i = map->level23[16*i+l2];
7363 if (i == 0xFF) {
7364 return -1;
7365 }
7366 /* level 3 */
7367 i = map->level23[16*map->count2 + 128*i + l3];
7368 if (i == 0) {
7369 return -1;
7370 }
7371 return i;
7372}
7373
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007374/* Lookup the character ch in the mapping. If the character
7375 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007376 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007377static PyObject *
7378charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007379{
Christian Heimes217cfd12007-12-02 14:31:20 +00007380 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007381 PyObject *x;
7382
7383 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007384 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007385 x = PyObject_GetItem(mapping, w);
7386 Py_DECREF(w);
7387 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007388 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7389 /* No mapping found means: mapping is undefined. */
7390 PyErr_Clear();
7391 x = Py_None;
7392 Py_INCREF(x);
7393 return x;
7394 } else
7395 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007396 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007397 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007398 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007399 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007400 long value = PyLong_AS_LONG(x);
7401 if (value < 0 || value > 255) {
7402 PyErr_SetString(PyExc_TypeError,
7403 "character mapping must be in range(256)");
7404 Py_DECREF(x);
7405 return NULL;
7406 }
7407 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007408 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007409 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007410 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007411 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007412 /* wrong return value */
7413 PyErr_Format(PyExc_TypeError,
7414 "character mapping must return integer, bytes or None, not %.400s",
7415 x->ob_type->tp_name);
7416 Py_DECREF(x);
7417 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007418 }
7419}
7420
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007421static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007422charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007423{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007424 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7425 /* exponentially overallocate to minimize reallocations */
7426 if (requiredsize < 2*outsize)
7427 requiredsize = 2*outsize;
7428 if (_PyBytes_Resize(outobj, requiredsize))
7429 return -1;
7430 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007431}
7432
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no entry; nothing was written */
    enc_EXCEPTION   /* lookup or resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007436/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007437 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007438 space is available. Return a new reference to the object that
7439 was put in the output buffer, or Py_None, if the mapping was undefined
7440 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007441 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007442static charmapencode_result
7443charmapencode_output(Py_UNICODE c, PyObject *mapping,
7444 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007445{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007446 PyObject *rep;
7447 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00007448 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007449
Christian Heimes90aa7642007-12-19 02:45:37 +00007450 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007451 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007452 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007453 if (res == -1)
7454 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00007455 if (outsize<requiredsize)
7456 if (charmapencode_resize(outobj, outpos, requiredsize))
7457 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00007458 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007459 outstart[(*outpos)++] = (char)res;
7460 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007461 }
7462
7463 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007464 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007465 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007466 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007467 Py_DECREF(rep);
7468 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007469 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007470 if (PyLong_Check(rep)) {
7471 Py_ssize_t requiredsize = *outpos+1;
7472 if (outsize<requiredsize)
7473 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7474 Py_DECREF(rep);
7475 return enc_EXCEPTION;
7476 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007477 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007478 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007479 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007480 else {
7481 const char *repchars = PyBytes_AS_STRING(rep);
7482 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7483 Py_ssize_t requiredsize = *outpos+repsize;
7484 if (outsize<requiredsize)
7485 if (charmapencode_resize(outobj, outpos, requiredsize)) {
7486 Py_DECREF(rep);
7487 return enc_EXCEPTION;
7488 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007489 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00007490 memcpy(outstart + *outpos, repchars, repsize);
7491 *outpos += repsize;
7492 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007493 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007494 Py_DECREF(rep);
7495 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007496}
7497
7498/* handle an error in PyUnicode_EncodeCharmap
7499 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007500static int
7501charmap_encoding_error(
Martin v. Löwis18e16552006-02-15 17:27:45 +00007502 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007503 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00007504 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00007505 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007506{
7507 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007508 Py_ssize_t repsize;
7509 Py_ssize_t newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007510 Py_UNICODE *uni2;
7511 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007512 Py_ssize_t collstartpos = *inpos;
7513 Py_ssize_t collendpos = *inpos+1;
7514 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007515 char *encoding = "charmap";
7516 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007517 charmapencode_result x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007518
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007519 /* find all unencodable characters */
7520 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007521 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00007522 if (Py_TYPE(mapping) == &EncodingMapType) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007523 int res = encoding_map_lookup(p[collendpos], mapping);
7524 if (res != -1)
7525 break;
7526 ++collendpos;
7527 continue;
7528 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007529
Benjamin Peterson29060642009-01-31 22:14:21 +00007530 rep = charmapencode_lookup(p[collendpos], mapping);
7531 if (rep==NULL)
7532 return -1;
7533 else if (rep!=Py_None) {
7534 Py_DECREF(rep);
7535 break;
7536 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007537 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00007538 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007539 }
7540 /* cache callback name lookup
7541 * (if not done yet, i.e. it's the first error) */
7542 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007543 if ((errors==NULL) || (!strcmp(errors, "strict")))
7544 *known_errorHandler = 1;
7545 else if (!strcmp(errors, "replace"))
7546 *known_errorHandler = 2;
7547 else if (!strcmp(errors, "ignore"))
7548 *known_errorHandler = 3;
7549 else if (!strcmp(errors, "xmlcharrefreplace"))
7550 *known_errorHandler = 4;
7551 else
7552 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007553 }
7554 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007555 case 1: /* strict */
7556 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7557 return -1;
7558 case 2: /* replace */
7559 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007560 x = charmapencode_output('?', mapping, res, respos);
7561 if (x==enc_EXCEPTION) {
7562 return -1;
7563 }
7564 else if (x==enc_FAILED) {
7565 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7566 return -1;
7567 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007568 }
7569 /* fall through */
7570 case 3: /* ignore */
7571 *inpos = collendpos;
7572 break;
7573 case 4: /* xmlcharrefreplace */
7574 /* generate replacement (temporarily (mis)uses p) */
7575 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007576 char buffer[2+29+1+1];
7577 char *cp;
7578 sprintf(buffer, "&#%d;", (int)p[collpos]);
7579 for (cp = buffer; *cp; ++cp) {
7580 x = charmapencode_output(*cp, mapping, res, respos);
7581 if (x==enc_EXCEPTION)
7582 return -1;
7583 else if (x==enc_FAILED) {
7584 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7585 return -1;
7586 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007587 }
7588 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007589 *inpos = collendpos;
7590 break;
7591 default:
7592 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Benjamin Peterson29060642009-01-31 22:14:21 +00007593 encoding, reason, p, size, exceptionObject,
7594 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007595 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007596 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00007597 if (PyBytes_Check(repunicode)) {
7598 /* Directly copy bytes result to output. */
7599 Py_ssize_t outsize = PyBytes_Size(*res);
7600 Py_ssize_t requiredsize;
7601 repsize = PyBytes_Size(repunicode);
7602 requiredsize = *respos + repsize;
7603 if (requiredsize > outsize)
7604 /* Make room for all additional bytes. */
7605 if (charmapencode_resize(res, respos, requiredsize)) {
7606 Py_DECREF(repunicode);
7607 return -1;
7608 }
7609 memcpy(PyBytes_AsString(*res) + *respos,
7610 PyBytes_AsString(repunicode), repsize);
7611 *respos += repsize;
7612 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007613 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00007614 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007615 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007616 /* generate replacement */
7617 repsize = PyUnicode_GET_SIZE(repunicode);
7618 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007619 x = charmapencode_output(*uni2, mapping, res, respos);
7620 if (x==enc_EXCEPTION) {
7621 return -1;
7622 }
7623 else if (x==enc_FAILED) {
7624 Py_DECREF(repunicode);
7625 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7626 return -1;
7627 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007628 }
7629 *inpos = newpos;
7630 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007631 }
7632 return 0;
7633}
7634
Alexander Belopolsky40018472011-02-26 01:02:56 +00007635PyObject *
7636PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7637 Py_ssize_t size,
7638 PyObject *mapping,
7639 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007640{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007641 /* output object */
7642 PyObject *res = NULL;
7643 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007644 Py_ssize_t inpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007645 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00007646 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007647 PyObject *errorHandler = NULL;
7648 PyObject *exc = NULL;
7649 /* the following variable is used for caching string comparisons
7650 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7651 * 3=ignore, 4=xmlcharrefreplace */
7652 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007653
7654 /* Default to Latin-1 */
7655 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007656 return PyUnicode_EncodeLatin1(p, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007657
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007658 /* allocate enough for a simple encoding without
7659 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00007660 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007661 if (res == NULL)
7662 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00007663 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007664 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007665
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007666 while (inpos<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007667 /* try to encode it */
7668 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7669 if (x==enc_EXCEPTION) /* error */
7670 goto onError;
7671 if (x==enc_FAILED) { /* unencodable character */
7672 if (charmap_encoding_error(p, size, &inpos, mapping,
7673 &exc,
7674 &known_errorHandler, &errorHandler, errors,
7675 &res, &respos)) {
7676 goto onError;
7677 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007678 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007679 else
7680 /* done with this character => adjust input position */
7681 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007682 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007683
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007684 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00007685 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007686 if (_PyBytes_Resize(&res, respos) < 0)
7687 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00007688
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007689 Py_XDECREF(exc);
7690 Py_XDECREF(errorHandler);
7691 return res;
7692
Benjamin Peterson29060642009-01-31 22:14:21 +00007693 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007694 Py_XDECREF(res);
7695 Py_XDECREF(exc);
7696 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007697 return NULL;
7698}
7699
Alexander Belopolsky40018472011-02-26 01:02:56 +00007700PyObject *
7701PyUnicode_AsCharmapString(PyObject *unicode,
7702 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007703{
7704 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007705 PyErr_BadArgument();
7706 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007707 }
7708 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
Benjamin Peterson29060642009-01-31 22:14:21 +00007709 PyUnicode_GET_SIZE(unicode),
7710 mapping,
7711 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007712}
7713
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007714/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007715static void
7716make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007717 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007718 Py_ssize_t startpos, Py_ssize_t endpos,
7719 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007720{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007721 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007722 *exceptionObject = _PyUnicodeTranslateError_Create(
7723 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007724 }
7725 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007726 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7727 goto onError;
7728 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7729 goto onError;
7730 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7731 goto onError;
7732 return;
7733 onError:
7734 Py_DECREF(*exceptionObject);
7735 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007736 }
7737}
7738
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007739/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007740static void
7741raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007742 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007743 Py_ssize_t startpos, Py_ssize_t endpos,
7744 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007745{
7746 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007747 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007748 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007749 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007750}
7751
7752/* error handling callback helper:
7753 build arguments, call the callback and check the arguments,
7754 put the result into newpos and return the replacement string, which
7755 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007756static PyObject *
7757unicode_translate_call_errorhandler(const char *errors,
7758 PyObject **errorHandler,
7759 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007760 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007761 Py_ssize_t startpos, Py_ssize_t endpos,
7762 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007763{
Benjamin Peterson142957c2008-07-04 19:55:29 +00007764 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007765
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007766 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007767 PyObject *restuple;
7768 PyObject *resunicode;
7769
7770 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007771 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007772 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007773 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007774 }
7775
7776 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007777 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007778 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007779 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007780
7781 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00007782 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007783 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007784 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007785 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00007786 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007787 Py_DECREF(restuple);
7788 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007789 }
7790 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00007791 &resunicode, &i_newpos)) {
7792 Py_DECREF(restuple);
7793 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007794 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00007795 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007796 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007797 else
7798 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007799 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007800 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7801 Py_DECREF(restuple);
7802 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007803 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007804 Py_INCREF(resunicode);
7805 Py_DECREF(restuple);
7806 return resunicode;
7807}
7808
7809/* Lookup the character ch in the mapping and put the result in result,
7810 which must be decrefed by the caller.
7811 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007812static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007813charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007814{
Christian Heimes217cfd12007-12-02 14:31:20 +00007815 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007816 PyObject *x;
7817
7818 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007819 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007820 x = PyObject_GetItem(mapping, w);
7821 Py_DECREF(w);
7822 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007823 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7824 /* No mapping found means: use 1:1 mapping. */
7825 PyErr_Clear();
7826 *result = NULL;
7827 return 0;
7828 } else
7829 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007830 }
7831 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007832 *result = x;
7833 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007834 }
Christian Heimes217cfd12007-12-02 14:31:20 +00007835 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007836 long value = PyLong_AS_LONG(x);
7837 long max = PyUnicode_GetMax();
7838 if (value < 0 || value > max) {
7839 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00007840 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00007841 Py_DECREF(x);
7842 return -1;
7843 }
7844 *result = x;
7845 return 0;
7846 }
7847 else if (PyUnicode_Check(x)) {
7848 *result = x;
7849 return 0;
7850 }
7851 else {
7852 /* wrong return value */
7853 PyErr_SetString(PyExc_TypeError,
7854 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007855 Py_DECREF(x);
7856 return -1;
7857 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007858}
7859/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00007860 if not reallocate and adjust various state variables.
7861 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007862static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007863charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00007864 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007865{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007866 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00007867 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007868 /* exponentially overallocate to minimize reallocations */
7869 if (requiredsize < 2 * oldsize)
7870 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007871 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7872 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007873 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007874 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007875 }
7876 return 0;
7877}
7878/* lookup the character, put the result in the output string and adjust
7879 various state variables. Return a new reference to the object that
7880 was put in the output buffer in *result, or Py_None, if the mapping was
7881 undefined (in which case no character was written).
7882 The called must decref result.
7883 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007884static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007885charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7886 PyObject *mapping, Py_UCS4 **output,
7887 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00007888 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007889{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007890 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7891 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00007892 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007893 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007894 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007895 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007896 }
7897 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007898 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00007899 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007900 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007901 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007902 }
7903 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007904 Py_ssize_t repsize;
7905 if (PyUnicode_READY(*res) == -1)
7906 return -1;
7907 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00007908 if (repsize==1) {
7909 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007910 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00007911 }
7912 else if (repsize!=0) {
7913 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007914 Py_ssize_t requiredsize = *opos +
7915 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00007916 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007917 Py_ssize_t i;
7918 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00007919 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007920 for(i = 0; i < repsize; i++)
7921 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00007922 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007923 }
7924 else
Benjamin Peterson29060642009-01-31 22:14:21 +00007925 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007926 return 0;
7927}
7928
Alexander Belopolsky40018472011-02-26 01:02:56 +00007929PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007930_PyUnicode_TranslateCharmap(PyObject *input,
7931 PyObject *mapping,
7932 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007933{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007934 /* input object */
7935 char *idata;
7936 Py_ssize_t size, i;
7937 int kind;
7938 /* output buffer */
7939 Py_UCS4 *output = NULL;
7940 Py_ssize_t osize;
7941 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007942 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007943 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007944 char *reason = "character maps to <undefined>";
7945 PyObject *errorHandler = NULL;
7946 PyObject *exc = NULL;
7947 /* the following variable is used for caching string comparisons
7948 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7949 * 3=ignore, 4=xmlcharrefreplace */
7950 int known_errorHandler = -1;
7951
Guido van Rossumd57fd912000-03-10 22:53:23 +00007952 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007953 PyErr_BadArgument();
7954 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007955 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007956
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007957 if (PyUnicode_READY(input) == -1)
7958 return NULL;
7959 idata = (char*)PyUnicode_DATA(input);
7960 kind = PyUnicode_KIND(input);
7961 size = PyUnicode_GET_LENGTH(input);
7962 i = 0;
7963
7964 if (size == 0) {
7965 Py_INCREF(input);
7966 return input;
7967 }
7968
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007969 /* allocate enough for a simple 1:1 translation without
7970 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007971 osize = size;
7972 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
7973 opos = 0;
7974 if (output == NULL) {
7975 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00007976 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007977 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007978
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007979 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007980 /* try to encode it */
7981 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007982 if (charmaptranslate_output(input, i, mapping,
7983 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007984 Py_XDECREF(x);
7985 goto onError;
7986 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007987 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00007988 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007989 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007990 else { /* untranslatable character */
7991 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7992 Py_ssize_t repsize;
7993 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007994 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00007995 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007996 Py_ssize_t collstart = i;
7997 Py_ssize_t collend = i+1;
7998 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007999
Benjamin Peterson29060642009-01-31 22:14:21 +00008000 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008001 while (collend < size) {
8002 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008003 goto onError;
8004 Py_XDECREF(x);
8005 if (x!=Py_None)
8006 break;
8007 ++collend;
8008 }
8009 /* cache callback name lookup
8010 * (if not done yet, i.e. it's the first error) */
8011 if (known_errorHandler==-1) {
8012 if ((errors==NULL) || (!strcmp(errors, "strict")))
8013 known_errorHandler = 1;
8014 else if (!strcmp(errors, "replace"))
8015 known_errorHandler = 2;
8016 else if (!strcmp(errors, "ignore"))
8017 known_errorHandler = 3;
8018 else if (!strcmp(errors, "xmlcharrefreplace"))
8019 known_errorHandler = 4;
8020 else
8021 known_errorHandler = 0;
8022 }
8023 switch (known_errorHandler) {
8024 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008025 raise_translate_exception(&exc, input, collstart,
8026 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008027 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008028 case 2: /* replace */
8029 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008030 for (coll = collstart; coll<collend; coll++)
8031 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008032 /* fall through */
8033 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008034 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008035 break;
8036 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008037 /* generate replacement (temporarily (mis)uses i) */
8038 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008039 char buffer[2+29+1+1];
8040 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008041 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8042 if (charmaptranslate_makespace(&output, &osize,
8043 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008044 goto onError;
8045 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008046 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008047 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008048 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008049 break;
8050 default:
8051 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008052 reason, input, &exc,
8053 collstart, collend, &newpos);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02008054 if (repunicode == NULL || _PyUnicode_READY_REPLACE(&repunicode))
Benjamin Peterson29060642009-01-31 22:14:21 +00008055 goto onError;
8056 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008057 repsize = PyUnicode_GET_LENGTH(repunicode);
8058 if (charmaptranslate_makespace(&output, &osize,
8059 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008060 Py_DECREF(repunicode);
8061 goto onError;
8062 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008063 for (uni2 = 0; repsize-->0; ++uni2)
8064 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8065 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008066 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008067 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008068 }
8069 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008070 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8071 if (!res)
8072 goto onError;
8073 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008074 Py_XDECREF(exc);
8075 Py_XDECREF(errorHandler);
8076 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008077
Benjamin Peterson29060642009-01-31 22:14:21 +00008078 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008079 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008080 Py_XDECREF(exc);
8081 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008082 return NULL;
8083}
8084
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008085/* Deprecated. Use PyUnicode_Translate instead. */
8086PyObject *
8087PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8088 Py_ssize_t size,
8089 PyObject *mapping,
8090 const char *errors)
8091{
8092 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8093 if (!unicode)
8094 return NULL;
8095 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8096}
8097
Alexander Belopolsky40018472011-02-26 01:02:56 +00008098PyObject *
8099PyUnicode_Translate(PyObject *str,
8100 PyObject *mapping,
8101 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008102{
8103 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008104
Guido van Rossumd57fd912000-03-10 22:53:23 +00008105 str = PyUnicode_FromObject(str);
8106 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008107 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008108 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008109 Py_DECREF(str);
8110 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008111
Benjamin Peterson29060642009-01-31 22:14:21 +00008112 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008113 Py_XDECREF(str);
8114 return NULL;
8115}
Tim Petersced69f82003-09-16 20:30:58 +00008116
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008117static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008118fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008119{
8120 /* No need to call PyUnicode_READY(self) because this function is only
8121 called as a callback from fixup() which does it already. */
8122 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8123 const int kind = PyUnicode_KIND(self);
8124 void *data = PyUnicode_DATA(self);
8125 Py_UCS4 maxchar = 0, ch, fixed;
8126 Py_ssize_t i;
8127
8128 for (i = 0; i < len; ++i) {
8129 ch = PyUnicode_READ(kind, data, i);
8130 fixed = 0;
8131 if (ch > 127) {
8132 if (Py_UNICODE_ISSPACE(ch))
8133 fixed = ' ';
8134 else {
8135 const int decimal = Py_UNICODE_TODECIMAL(ch);
8136 if (decimal >= 0)
8137 fixed = '0' + decimal;
8138 }
8139 if (fixed != 0) {
8140 if (fixed > maxchar)
8141 maxchar = fixed;
8142 PyUnicode_WRITE(kind, data, i, fixed);
8143 }
8144 else if (ch > maxchar)
8145 maxchar = ch;
8146 }
8147 else if (ch > maxchar)
8148 maxchar = ch;
8149 }
8150
8151 return maxchar;
8152}
8153
8154PyObject *
8155_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8156{
8157 if (!PyUnicode_Check(unicode)) {
8158 PyErr_BadInternalCall();
8159 return NULL;
8160 }
8161 if (PyUnicode_READY(unicode) == -1)
8162 return NULL;
8163 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8164 /* If the string is already ASCII, just return the same string */
8165 Py_INCREF(unicode);
8166 return unicode;
8167 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008168 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008169}
8170
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008171PyObject *
8172PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8173 Py_ssize_t length)
8174{
8175 PyObject *result;
8176 Py_UNICODE *p; /* write pointer into result */
8177 Py_ssize_t i;
8178 /* Copy to a new string */
8179 result = (PyObject *)_PyUnicode_New(length);
8180 Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
8181 if (result == NULL)
8182 return result;
8183 p = PyUnicode_AS_UNICODE(result);
8184 /* Iterate over code points */
8185 for (i = 0; i < length; i++) {
8186 Py_UNICODE ch =s[i];
8187 if (ch > 127) {
8188 int decimal = Py_UNICODE_TODECIMAL(ch);
8189 if (decimal >= 0)
8190 p[i] = '0' + decimal;
8191 }
8192 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008193#ifndef DONT_MAKE_RESULT_READY
8194 if (_PyUnicode_READY_REPLACE(&result)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008195 Py_DECREF(result);
8196 return NULL;
8197 }
Victor Stinner17efeed2011-10-04 20:05:46 +02008198#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02008199 assert(_PyUnicode_CheckConsistency(result, 1));
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008200 return result;
8201}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008202/* --- Decimal Encoder ---------------------------------------------------- */
8203
Alexander Belopolsky40018472011-02-26 01:02:56 +00008204int
8205PyUnicode_EncodeDecimal(Py_UNICODE *s,
8206 Py_ssize_t length,
8207 char *output,
8208 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008209{
8210 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008211 PyObject *errorHandler = NULL;
8212 PyObject *exc = NULL;
8213 const char *encoding = "decimal";
8214 const char *reason = "invalid decimal Unicode string";
8215 /* the following variable is used for caching string comparisons
8216 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
8217 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008218
8219 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008220 PyErr_BadArgument();
8221 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008222 }
8223
8224 p = s;
8225 end = s + length;
8226 while (p < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008227 register Py_UNICODE ch = *p;
8228 int decimal;
8229 PyObject *repunicode;
8230 Py_ssize_t repsize;
8231 Py_ssize_t newpos;
8232 Py_UNICODE *uni2;
8233 Py_UNICODE *collstart;
8234 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00008235
Benjamin Peterson29060642009-01-31 22:14:21 +00008236 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008237 *output++ = ' ';
Benjamin Peterson29060642009-01-31 22:14:21 +00008238 ++p;
8239 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008240 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008241 decimal = Py_UNICODE_TODECIMAL(ch);
8242 if (decimal >= 0) {
8243 *output++ = '0' + decimal;
8244 ++p;
8245 continue;
8246 }
8247 if (0 < ch && ch < 256) {
8248 *output++ = (char)ch;
8249 ++p;
8250 continue;
8251 }
8252 /* All other characters are considered unencodable */
8253 collstart = p;
8254 collend = p+1;
8255 while (collend < end) {
8256 if ((0 < *collend && *collend < 256) ||
8257 !Py_UNICODE_ISSPACE(*collend) ||
8258 Py_UNICODE_TODECIMAL(*collend))
8259 break;
8260 }
8261 /* cache callback name lookup
8262 * (if not done yet, i.e. it's the first error) */
8263 if (known_errorHandler==-1) {
8264 if ((errors==NULL) || (!strcmp(errors, "strict")))
8265 known_errorHandler = 1;
8266 else if (!strcmp(errors, "replace"))
8267 known_errorHandler = 2;
8268 else if (!strcmp(errors, "ignore"))
8269 known_errorHandler = 3;
8270 else if (!strcmp(errors, "xmlcharrefreplace"))
8271 known_errorHandler = 4;
8272 else
8273 known_errorHandler = 0;
8274 }
8275 switch (known_errorHandler) {
8276 case 1: /* strict */
8277 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
8278 goto onError;
8279 case 2: /* replace */
8280 for (p = collstart; p < collend; ++p)
8281 *output++ = '?';
8282 /* fall through */
8283 case 3: /* ignore */
8284 p = collend;
8285 break;
8286 case 4: /* xmlcharrefreplace */
8287 /* generate replacement (temporarily (mis)uses p) */
8288 for (p = collstart; p < collend; ++p)
8289 output += sprintf(output, "&#%d;", (int)*p);
8290 p = collend;
8291 break;
8292 default:
8293 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
8294 encoding, reason, s, length, &exc,
8295 collstart-s, collend-s, &newpos);
8296 if (repunicode == NULL)
8297 goto onError;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008298 if (!PyUnicode_Check(repunicode)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00008299 /* Byte results not supported, since they have no decimal property. */
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008300 PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8301 Py_DECREF(repunicode);
8302 goto onError;
8303 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008304 /* generate replacement */
8305 repsize = PyUnicode_GET_SIZE(repunicode);
8306 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8307 Py_UNICODE ch = *uni2;
8308 if (Py_UNICODE_ISSPACE(ch))
8309 *output++ = ' ';
8310 else {
8311 decimal = Py_UNICODE_TODECIMAL(ch);
8312 if (decimal >= 0)
8313 *output++ = '0' + decimal;
8314 else if (0 < ch && ch < 256)
8315 *output++ = (char)ch;
8316 else {
8317 Py_DECREF(repunicode);
8318 raise_encode_exception(&exc, encoding,
8319 s, length, collstart-s, collend-s, reason);
8320 goto onError;
8321 }
8322 }
8323 }
8324 p = s + newpos;
8325 Py_DECREF(repunicode);
8326 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00008327 }
8328 /* 0-terminate the output string */
8329 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008330 Py_XDECREF(exc);
8331 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008332 return 0;
8333
Benjamin Peterson29060642009-01-31 22:14:21 +00008334 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008335 Py_XDECREF(exc);
8336 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008337 return -1;
8338}
8339
Guido van Rossumd57fd912000-03-10 22:53:23 +00008340/* --- Helpers ------------------------------------------------------------ */
8341
Victor Stinnerc3cec782011-10-05 21:24:08 +02008342#include "stringlib/asciilib.h"
8343#include "stringlib/fastsearch.h"
8344#include "stringlib/partition.h"
8345#include "stringlib/split.h"
8346#include "stringlib/count.h"
8347#include "stringlib/find.h"
8348#include "stringlib/localeutil.h"
8349#include "stringlib/undef.h"
8350
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008351#include "stringlib/ucs1lib.h"
8352#include "stringlib/fastsearch.h"
8353#include "stringlib/partition.h"
8354#include "stringlib/split.h"
8355#include "stringlib/count.h"
8356#include "stringlib/find.h"
8357#include "stringlib/localeutil.h"
8358#include "stringlib/undef.h"
8359
8360#include "stringlib/ucs2lib.h"
8361#include "stringlib/fastsearch.h"
8362#include "stringlib/partition.h"
8363#include "stringlib/split.h"
8364#include "stringlib/count.h"
8365#include "stringlib/find.h"
8366#include "stringlib/localeutil.h"
8367#include "stringlib/undef.h"
8368
8369#include "stringlib/ucs4lib.h"
8370#include "stringlib/fastsearch.h"
8371#include "stringlib/partition.h"
8372#include "stringlib/split.h"
8373#include "stringlib/count.h"
8374#include "stringlib/find.h"
8375#include "stringlib/localeutil.h"
8376#include "stringlib/undef.h"
8377
8378static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02008379any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ascii)(const Py_UCS1*, Py_ssize_t,
8380 const Py_UCS1*, Py_ssize_t,
8381 Py_ssize_t, Py_ssize_t),
8382 Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008383 const Py_UCS1*, Py_ssize_t,
8384 Py_ssize_t, Py_ssize_t),
8385 Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
8386 const Py_UCS2*, Py_ssize_t,
8387 Py_ssize_t, Py_ssize_t),
8388 Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
8389 const Py_UCS4*, Py_ssize_t,
8390 Py_ssize_t, Py_ssize_t),
8391 PyObject* s1, PyObject* s2,
8392 Py_ssize_t start,
8393 Py_ssize_t end)
8394{
8395 int kind1, kind2, kind;
8396 void *buf1, *buf2;
8397 Py_ssize_t len1, len2, result;
8398
8399 kind1 = PyUnicode_KIND(s1);
8400 kind2 = PyUnicode_KIND(s2);
8401 kind = kind1 > kind2 ? kind1 : kind2;
8402 buf1 = PyUnicode_DATA(s1);
8403 buf2 = PyUnicode_DATA(s2);
8404 if (kind1 != kind)
8405 buf1 = _PyUnicode_AsKind(s1, kind);
8406 if (!buf1)
8407 return -2;
8408 if (kind2 != kind)
8409 buf2 = _PyUnicode_AsKind(s2, kind);
8410 if (!buf2) {
8411 if (kind1 != kind) PyMem_Free(buf1);
8412 return -2;
8413 }
8414 len1 = PyUnicode_GET_LENGTH(s1);
8415 len2 = PyUnicode_GET_LENGTH(s2);
8416
8417 switch(kind) {
8418 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008419 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8420 result = ascii(buf1, len1, buf2, len2, start, end);
8421 else
8422 result = ucs1(buf1, len1, buf2, len2, start, end);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008423 break;
8424 case PyUnicode_2BYTE_KIND:
8425 result = ucs2(buf1, len1, buf2, len2, start, end);
8426 break;
8427 case PyUnicode_4BYTE_KIND:
8428 result = ucs4(buf1, len1, buf2, len2, start, end);
8429 break;
8430 default:
8431 assert(0); result = -2;
8432 }
8433
8434 if (kind1 != kind)
8435 PyMem_Free(buf1);
8436 if (kind2 != kind)
8437 PyMem_Free(buf2);
8438
8439 return result;
8440}
8441
8442Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02008443_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008444 Py_ssize_t n_buffer,
8445 void *digits, Py_ssize_t n_digits,
8446 Py_ssize_t min_width,
8447 const char *grouping,
8448 const char *thousands_sep)
8449{
8450 switch(kind) {
8451 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008452 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
8453 return _PyUnicode_ascii_InsertThousandsGrouping(
8454 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8455 min_width, grouping, thousands_sep);
8456 else
8457 return _PyUnicode_ucs1_InsertThousandsGrouping(
8458 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8459 min_width, grouping, thousands_sep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008460 case PyUnicode_2BYTE_KIND:
8461 return _PyUnicode_ucs2_InsertThousandsGrouping(
8462 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8463 min_width, grouping, thousands_sep);
8464 case PyUnicode_4BYTE_KIND:
8465 return _PyUnicode_ucs4_InsertThousandsGrouping(
8466 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8467 min_width, grouping, thousands_sep);
8468 }
8469 assert(0);
8470 return -1;
8471}
8472
8473
Eric Smith8c663262007-08-25 02:26:07 +00008474#include "stringlib/unicodedefs.h"
Thomas Wouters477c8d52006-05-27 19:21:47 +00008475#include "stringlib/fastsearch.h"
Antoine Pitrouf2c54842010-01-13 08:07:53 +00008476
Thomas Wouters477c8d52006-05-27 19:21:47 +00008477#include "stringlib/count.h"
8478#include "stringlib/find.h"
Eric Smith5807c412008-05-11 21:00:57 +00008479
/* Normalize a (start, end) slice pair in place against a sequence of
   length len: end is capped at len; a negative start or end has len added
   and is then floored at 0.  start is not capped at len.

   NOTE: the expansion is a bare statement sequence (no do/while wrapper)
   and mentions each argument several times, so use it only as a complete
   statement, with assignable start/end and side-effect-free arguments. */
#define ADJUST_INDICES(start, end, len) \
    if (end > len)                      \
        end = len;                      \
    else if (end < 0) {                 \
        end += len;                     \
        if (end < 0)                    \
            end = 0;                    \
    }                                   \
    if (start < 0) {                    \
        start += len;                   \
        if (start < 0)                  \
            start = 0;                  \
    }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008494
Alexander Belopolsky40018472011-02-26 01:02:56 +00008495Py_ssize_t
8496PyUnicode_Count(PyObject *str,
8497 PyObject *substr,
8498 Py_ssize_t start,
8499 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008500{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008501 Py_ssize_t result;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008502 PyUnicodeObject* str_obj;
8503 PyUnicodeObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008504 int kind1, kind2, kind;
8505 void *buf1 = NULL, *buf2 = NULL;
8506 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00008507
Thomas Wouters477c8d52006-05-27 19:21:47 +00008508 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008509 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008510 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008511 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02008512 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008513 Py_DECREF(str_obj);
8514 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008515 }
Tim Petersced69f82003-09-16 20:30:58 +00008516
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008517 kind1 = PyUnicode_KIND(str_obj);
8518 kind2 = PyUnicode_KIND(sub_obj);
8519 kind = kind1 > kind2 ? kind1 : kind2;
8520 buf1 = PyUnicode_DATA(str_obj);
8521 if (kind1 != kind)
8522 buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
8523 if (!buf1)
8524 goto onError;
8525 buf2 = PyUnicode_DATA(sub_obj);
8526 if (kind2 != kind)
8527 buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
8528 if (!buf2)
8529 goto onError;
8530 len1 = PyUnicode_GET_LENGTH(str_obj);
8531 len2 = PyUnicode_GET_LENGTH(sub_obj);
8532
8533 ADJUST_INDICES(start, end, len1);
8534 switch(kind) {
8535 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008536 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
8537 result = asciilib_count(
8538 ((Py_UCS1*)buf1) + start, end - start,
8539 buf2, len2, PY_SSIZE_T_MAX
8540 );
8541 else
8542 result = ucs1lib_count(
8543 ((Py_UCS1*)buf1) + start, end - start,
8544 buf2, len2, PY_SSIZE_T_MAX
8545 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008546 break;
8547 case PyUnicode_2BYTE_KIND:
8548 result = ucs2lib_count(
8549 ((Py_UCS2*)buf1) + start, end - start,
8550 buf2, len2, PY_SSIZE_T_MAX
8551 );
8552 break;
8553 case PyUnicode_4BYTE_KIND:
8554 result = ucs4lib_count(
8555 ((Py_UCS4*)buf1) + start, end - start,
8556 buf2, len2, PY_SSIZE_T_MAX
8557 );
8558 break;
8559 default:
8560 assert(0); result = 0;
8561 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00008562
8563 Py_DECREF(sub_obj);
8564 Py_DECREF(str_obj);
8565
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008566 if (kind1 != kind)
8567 PyMem_Free(buf1);
8568 if (kind2 != kind)
8569 PyMem_Free(buf2);
8570
Guido van Rossumd57fd912000-03-10 22:53:23 +00008571 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008572 onError:
8573 Py_DECREF(sub_obj);
8574 Py_DECREF(str_obj);
8575 if (kind1 != kind && buf1)
8576 PyMem_Free(buf1);
8577 if (kind2 != kind && buf2)
8578 PyMem_Free(buf2);
8579 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008580}
8581
Alexander Belopolsky40018472011-02-26 01:02:56 +00008582Py_ssize_t
8583PyUnicode_Find(PyObject *str,
8584 PyObject *sub,
8585 Py_ssize_t start,
8586 Py_ssize_t end,
8587 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008588{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008589 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008590
Guido van Rossumd57fd912000-03-10 22:53:23 +00008591 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008592 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008593 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00008594 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008595 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008596 Py_DECREF(str);
8597 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008598 }
Tim Petersced69f82003-09-16 20:30:58 +00008599
Thomas Wouters477c8d52006-05-27 19:21:47 +00008600 if (direction > 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008601 result = any_find_slice(
Victor Stinnerc3cec782011-10-05 21:24:08 +02008602 asciilib_find_slice, ucs1lib_find_slice,
8603 ucs2lib_find_slice, ucs4lib_find_slice,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008604 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008605 );
8606 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008607 result = any_find_slice(
Victor Stinnerc3cec782011-10-05 21:24:08 +02008608 asciilib_find_slice, ucs1lib_rfind_slice,
8609 ucs2lib_rfind_slice, ucs4lib_rfind_slice,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008610 str, sub, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +00008611 );
8612
Guido van Rossumd57fd912000-03-10 22:53:23 +00008613 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00008614 Py_DECREF(sub);
8615
Guido van Rossumd57fd912000-03-10 22:53:23 +00008616 return result;
8617}
8618
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008619Py_ssize_t
8620PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8621 Py_ssize_t start, Py_ssize_t end,
8622 int direction)
8623{
8624 char *result;
8625 int kind;
8626 if (PyUnicode_READY(str) == -1)
8627 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02008628 if (start < 0 || end < 0) {
8629 PyErr_SetString(PyExc_IndexError, "string index out of range");
8630 return -2;
8631 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008632 if (end > PyUnicode_GET_LENGTH(str))
8633 end = PyUnicode_GET_LENGTH(str);
8634 kind = PyUnicode_KIND(str);
8635 result = findchar(PyUnicode_1BYTE_DATA(str)
8636 + PyUnicode_KIND_SIZE(kind, start),
8637 kind,
8638 end-start, ch, direction);
8639 if (!result)
8640 return -1;
8641 return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8642}
8643
Alexander Belopolsky40018472011-02-26 01:02:56 +00008644static int
8645tailmatch(PyUnicodeObject *self,
8646 PyUnicodeObject *substring,
8647 Py_ssize_t start,
8648 Py_ssize_t end,
8649 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008650{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008651 int kind_self;
8652 int kind_sub;
8653 void *data_self;
8654 void *data_sub;
8655 Py_ssize_t offset;
8656 Py_ssize_t i;
8657 Py_ssize_t end_sub;
8658
8659 if (PyUnicode_READY(self) == -1 ||
8660 PyUnicode_READY(substring) == -1)
8661 return 0;
8662
8663 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008664 return 1;
8665
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008666 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8667 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008668 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00008669 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008670
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008671 kind_self = PyUnicode_KIND(self);
8672 data_self = PyUnicode_DATA(self);
8673 kind_sub = PyUnicode_KIND(substring);
8674 data_sub = PyUnicode_DATA(substring);
8675 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8676
8677 if (direction > 0)
8678 offset = end;
8679 else
8680 offset = start;
8681
8682 if (PyUnicode_READ(kind_self, data_self, offset) ==
8683 PyUnicode_READ(kind_sub, data_sub, 0) &&
8684 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8685 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8686 /* If both are of the same kind, memcmp is sufficient */
8687 if (kind_self == kind_sub) {
8688 return ! memcmp((char *)data_self +
8689 (offset * PyUnicode_CHARACTER_SIZE(substring)),
8690 data_sub,
8691 PyUnicode_GET_LENGTH(substring) *
8692 PyUnicode_CHARACTER_SIZE(substring));
8693 }
8694 /* otherwise we have to compare each character by first accesing it */
8695 else {
8696 /* We do not need to compare 0 and len(substring)-1 because
8697 the if statement above ensured already that they are equal
8698 when we end up here. */
8699 // TODO: honor direction and do a forward or backwards search
8700 for (i = 1; i < end_sub; ++i) {
8701 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8702 PyUnicode_READ(kind_sub, data_sub, i))
8703 return 0;
8704 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008705 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008706 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008707 }
8708
8709 return 0;
8710}
8711
Alexander Belopolsky40018472011-02-26 01:02:56 +00008712Py_ssize_t
8713PyUnicode_Tailmatch(PyObject *str,
8714 PyObject *substr,
8715 Py_ssize_t start,
8716 Py_ssize_t end,
8717 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008718{
Martin v. Löwis18e16552006-02-15 17:27:45 +00008719 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00008720
Guido van Rossumd57fd912000-03-10 22:53:23 +00008721 str = PyUnicode_FromObject(str);
8722 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008723 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008724 substr = PyUnicode_FromObject(substr);
8725 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008726 Py_DECREF(str);
8727 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008728 }
Tim Petersced69f82003-09-16 20:30:58 +00008729
Guido van Rossumd57fd912000-03-10 22:53:23 +00008730 result = tailmatch((PyUnicodeObject *)str,
Benjamin Peterson29060642009-01-31 22:14:21 +00008731 (PyUnicodeObject *)substr,
8732 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008733 Py_DECREF(str);
8734 Py_DECREF(substr);
8735 return result;
8736}
8737
Guido van Rossumd57fd912000-03-10 22:53:23 +00008738/* Apply fixfct filter to the Unicode object self and return a
8739 reference to the modified object */
8740
Alexander Belopolsky40018472011-02-26 01:02:56 +00008741static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02008742fixup(PyObject *self,
8743 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00008744{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008745 PyObject *u;
8746 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008747
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008748 if (PyUnicode_READY(self) == -1)
8749 return NULL;
8750 maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8751 u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8752 maxchar_old);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008753 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008754 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008755
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008756 Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8757 PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00008758
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008759 /* fix functions return the new maximum character in a string,
8760 if the kind of the resulting unicode object does not change,
8761 everything is fine. Otherwise we need to change the string kind
8762 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02008763 maxchar_new = fixfct(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008764 if (maxchar_new == 0)
8765 /* do nothing, keep maxchar_new at 0 which means no changes. */;
8766 else if (maxchar_new <= 127)
8767 maxchar_new = 127;
8768 else if (maxchar_new <= 255)
8769 maxchar_new = 255;
8770 else if (maxchar_new <= 65535)
8771 maxchar_new = 65535;
8772 else
8773 maxchar_new = 1114111; /* 0x10ffff */
8774
8775 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008776 /* fixfct should return TRUE if it modified the buffer. If
8777 FALSE, return a reference to the original buffer instead
8778 (to save space, not time) */
8779 Py_INCREF(self);
8780 Py_DECREF(u);
8781 return (PyObject*) self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008782 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008783 else if (maxchar_new == maxchar_old) {
8784 return u;
8785 }
8786 else {
8787 /* In case the maximum character changed, we need to
8788 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008789 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008790 if (v == NULL) {
8791 Py_DECREF(u);
8792 return NULL;
8793 }
8794 if (maxchar_new > maxchar_old) {
8795 /* If the maxchar increased so that the kind changed, not all
8796 characters are representable anymore and we need to fix the
8797 string again. This only happens in very few cases. */
Victor Stinner157f83f2011-09-28 21:41:31 +02008798 if (PyUnicode_CopyCharacters(v, 0,
8799 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008800 PyUnicode_GET_LENGTH(self)) < 0)
8801 {
8802 Py_DECREF(u);
8803 return NULL;
8804 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008805 maxchar_old = fixfct(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008806 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8807 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008808 else {
Victor Stinner157f83f2011-09-28 21:41:31 +02008809 if (PyUnicode_CopyCharacters(v, 0,
8810 u, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02008811 PyUnicode_GET_LENGTH(self)) < 0)
8812 {
8813 Py_DECREF(u);
8814 return NULL;
8815 }
8816 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008817
8818 Py_DECREF(u);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02008819 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008820 return v;
8821 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008822}
8823
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008824static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008825fixupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008826{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008827 /* No need to call PyUnicode_READY(self) because this function is only
8828 called as a callback from fixup() which does it already. */
8829 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8830 const int kind = PyUnicode_KIND(self);
8831 void *data = PyUnicode_DATA(self);
8832 int touched = 0;
8833 Py_UCS4 maxchar = 0;
8834 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008835
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008836 for (i = 0; i < len; ++i) {
8837 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8838 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8839 if (up != ch) {
8840 if (up > maxchar)
8841 maxchar = up;
8842 PyUnicode_WRITE(kind, data, i, up);
8843 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008844 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008845 else if (ch > maxchar)
8846 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008847 }
8848
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008849 if (touched)
8850 return maxchar;
8851 else
8852 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008853}
8854
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008855static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008856fixlower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008857{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008858 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8859 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8860 const int kind = PyUnicode_KIND(self);
8861 void *data = PyUnicode_DATA(self);
8862 int touched = 0;
8863 Py_UCS4 maxchar = 0;
8864 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008865
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008866 for(i = 0; i < len; ++i) {
8867 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8868 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8869 if (lo != ch) {
8870 if (lo > maxchar)
8871 maxchar = lo;
8872 PyUnicode_WRITE(kind, data, i, lo);
8873 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008874 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008875 else if (ch > maxchar)
8876 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008877 }
8878
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008879 if (touched)
8880 return maxchar;
8881 else
8882 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008883}
8884
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008885static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008886fixswapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008887{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008888 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8889 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8890 const int kind = PyUnicode_KIND(self);
8891 void *data = PyUnicode_DATA(self);
8892 int touched = 0;
8893 Py_UCS4 maxchar = 0;
8894 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00008895
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008896 for(i = 0; i < len; ++i) {
8897 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8898 Py_UCS4 nu = 0;
8899
8900 if (Py_UNICODE_ISUPPER(ch))
8901 nu = Py_UNICODE_TOLOWER(ch);
8902 else if (Py_UNICODE_ISLOWER(ch))
8903 nu = Py_UNICODE_TOUPPER(ch);
8904
8905 if (nu != 0) {
8906 if (nu > maxchar)
8907 maxchar = nu;
8908 PyUnicode_WRITE(kind, data, i, nu);
8909 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008910 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008911 else if (ch > maxchar)
8912 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008913 }
8914
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008915 if (touched)
8916 return maxchar;
8917 else
8918 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008919}
8920
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008921static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008922fixcapitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008923{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008924 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8925 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8926 const int kind = PyUnicode_KIND(self);
8927 void *data = PyUnicode_DATA(self);
8928 int touched = 0;
8929 Py_UCS4 maxchar = 0;
8930 Py_ssize_t i = 0;
8931 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00008932
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008933 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008934 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008935
8936 ch = PyUnicode_READ(kind, data, i);
8937 if (!Py_UNICODE_ISUPPER(ch)) {
8938 maxchar = Py_UNICODE_TOUPPER(ch);
8939 PyUnicode_WRITE(kind, data, i, maxchar);
8940 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008941 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008942 ++i;
8943 for(; i < len; ++i) {
8944 ch = PyUnicode_READ(kind, data, i);
8945 if (!Py_UNICODE_ISLOWER(ch)) {
8946 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8947 if (lo > maxchar)
8948 maxchar = lo;
8949 PyUnicode_WRITE(kind, data, i, lo);
8950 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008951 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008952 else if (ch > maxchar)
8953 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00008954 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008955
8956 if (touched)
8957 return maxchar;
8958 else
8959 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008960}
8961
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008962static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008963fixtitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008964{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008965 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8966 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8967 const int kind = PyUnicode_KIND(self);
8968 void *data = PyUnicode_DATA(self);
8969 Py_UCS4 maxchar = 0;
8970 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008971 int previous_is_cased;
8972
8973 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008974 if (len == 1) {
8975 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8976 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
8977 if (ti != ch) {
8978 PyUnicode_WRITE(kind, data, i, ti);
8979 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00008980 }
8981 else
8982 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008983 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008984 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008985 for(; i < len; ++i) {
8986 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8987 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00008988
Benjamin Peterson29060642009-01-31 22:14:21 +00008989 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008990 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00008991 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008992 nu = Py_UNICODE_TOTITLE(ch);
8993
8994 if (nu > maxchar)
8995 maxchar = nu;
8996 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00008997
Benjamin Peterson29060642009-01-31 22:14:21 +00008998 if (Py_UNICODE_ISLOWER(ch) ||
8999 Py_UNICODE_ISUPPER(ch) ||
9000 Py_UNICODE_ISTITLE(ch))
9001 previous_is_cased = 1;
9002 else
9003 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009004 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009005 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009006}
9007
Tim Peters8ce9f162004-08-27 01:49:32 +00009008PyObject *
9009PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009010{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009011 PyObject *sep = NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009012 Py_ssize_t seplen = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009013 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009014 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009015 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9016 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009017 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009018 Py_ssize_t sz, i, res_offset;
9019 Py_UCS4 maxchar = 0;
9020 Py_UCS4 item_maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009021
Tim Peters05eba1f2004-08-27 21:32:02 +00009022 fseq = PySequence_Fast(seq, "");
9023 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009024 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009025 }
9026
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009027 /* NOTE: the following code can't call back into Python code,
9028 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009029 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009030
Tim Peters05eba1f2004-08-27 21:32:02 +00009031 seqlen = PySequence_Fast_GET_SIZE(fseq);
9032 /* If empty sequence, return u"". */
9033 if (seqlen == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009034 res = PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009035 goto Done;
Tim Peters05eba1f2004-08-27 21:32:02 +00009036 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009037 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00009038 /* If singleton sequence with an exact Unicode, return that. */
9039 if (seqlen == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009040 item = items[0];
9041 if (PyUnicode_CheckExact(item)) {
9042 Py_INCREF(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009043 res = item;
Benjamin Peterson29060642009-01-31 22:14:21 +00009044 goto Done;
9045 }
Tim Peters8ce9f162004-08-27 01:49:32 +00009046 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009047 else {
9048 /* Set up sep and seplen */
9049 if (separator == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009050 /* fall back to a blank space separator */
9051 sep = PyUnicode_FromOrdinal(' ');
Victor Stinnere9a29352011-10-01 02:14:59 +02009052 if (!sep)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009053 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00009054 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009055 else {
9056 if (!PyUnicode_Check(separator)) {
9057 PyErr_Format(PyExc_TypeError,
9058 "separator: expected str instance,"
9059 " %.80s found",
9060 Py_TYPE(separator)->tp_name);
9061 goto onError;
9062 }
Victor Stinner1b4f9ce2011-10-03 13:28:14 +02009063 if (PyUnicode_READY(separator))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009064 goto onError;
9065 sep = separator;
9066 seplen = PyUnicode_GET_LENGTH(separator);
9067 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
Georg Brandl7597add2011-10-05 16:36:47 +02009068 /* inc refcount to keep this code path symmetric with the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009069 above case of a blank separator */
9070 Py_INCREF(sep);
Tim Peters05eba1f2004-08-27 21:32:02 +00009071 }
9072 }
9073
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009074 /* There are at least two things to join, or else we have a subclass
9075 * of str in the sequence.
9076 * Do a pre-pass to figure out the total amount of space we'll
9077 * need (sz), and see whether all argument are strings.
9078 */
9079 sz = 0;
9080 for (i = 0; i < seqlen; i++) {
9081 const Py_ssize_t old_sz = sz;
9082 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009083 if (!PyUnicode_Check(item)) {
9084 PyErr_Format(PyExc_TypeError,
9085 "sequence item %zd: expected str instance,"
9086 " %.80s found",
9087 i, Py_TYPE(item)->tp_name);
9088 goto onError;
9089 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009090 if (PyUnicode_READY(item) == -1)
9091 goto onError;
9092 sz += PyUnicode_GET_LENGTH(item);
9093 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
9094 if (item_maxchar > maxchar)
9095 maxchar = item_maxchar;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009096 if (i != 0)
9097 sz += seplen;
9098 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9099 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009100 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009101 goto onError;
9102 }
9103 }
Tim Petersced69f82003-09-16 20:30:58 +00009104
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009105 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009106 if (res == NULL)
9107 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009108
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009109 /* Catenate everything. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009110 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinner9ce5a832011-10-03 23:36:02 +02009111 Py_ssize_t itemlen, copied;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009112 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009113 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009114 if (i && seplen != 0) {
9115 copied = PyUnicode_CopyCharacters(res, res_offset,
9116 sep, 0, seplen);
9117 if (copied < 0)
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009118 goto onError;
Victor Stinner9ce5a832011-10-03 23:36:02 +02009119#ifdef Py_DEBUG
9120 res_offset += copied;
9121#else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009122 res_offset += seplen;
Victor Stinner9ce5a832011-10-03 23:36:02 +02009123#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00009124 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009125 itemlen = PyUnicode_GET_LENGTH(item);
9126 if (itemlen != 0) {
9127 copied = PyUnicode_CopyCharacters(res, res_offset,
9128 item, 0, itemlen);
9129 if (copied < 0)
9130 goto onError;
9131#ifdef Py_DEBUG
9132 res_offset += copied;
9133#else
9134 res_offset += itemlen;
9135#endif
9136 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009137 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009138 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009139
Benjamin Peterson29060642009-01-31 22:14:21 +00009140 Done:
Tim Peters05eba1f2004-08-27 21:32:02 +00009141 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009142 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009143 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009144 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009145
Benjamin Peterson29060642009-01-31 22:14:21 +00009146 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009147 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009148 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009149 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009150 return NULL;
9151}
9152
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009153#define FILL(kind, data, value, start, length) \
9154 do { \
9155 Py_ssize_t i_ = 0; \
9156 assert(kind != PyUnicode_WCHAR_KIND); \
9157 switch ((kind)) { \
9158 case PyUnicode_1BYTE_KIND: { \
9159 unsigned char * to_ = (unsigned char *)((data)) + (start); \
9160 memset(to_, (unsigned char)value, length); \
9161 break; \
9162 } \
9163 case PyUnicode_2BYTE_KIND: { \
9164 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
9165 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9166 break; \
9167 } \
9168 default: { \
9169 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
9170 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9171 break; \
9172 } \
9173 } \
9174 } while (0)
9175
Victor Stinner9310abb2011-10-05 00:59:23 +02009176static PyObject *
9177pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009178 Py_ssize_t left,
9179 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009180 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009181{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009182 PyObject *u;
9183 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009184 int kind;
9185 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009186
9187 if (left < 0)
9188 left = 0;
9189 if (right < 0)
9190 right = 0;
9191
Tim Peters7a29bd52001-09-12 03:03:31 +00009192 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009193 Py_INCREF(self);
9194 return self;
9195 }
9196
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009197 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9198 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009199 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9200 return NULL;
9201 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009202 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9203 if (fill > maxchar)
9204 maxchar = fill;
9205 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009206 if (!u)
9207 return NULL;
9208
9209 kind = PyUnicode_KIND(u);
9210 data = PyUnicode_DATA(u);
9211 if (left)
9212 FILL(kind, data, fill, 0, left);
9213 if (right)
9214 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinner157f83f2011-09-28 21:41:31 +02009215 if (PyUnicode_CopyCharacters(u, left,
9216 (PyObject*)self, 0,
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009217 _PyUnicode_LENGTH(self)) < 0)
9218 {
9219 Py_DECREF(u);
9220 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009221 }
9222
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009223 assert(_PyUnicode_CheckConsistency(u, 1));
9224 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009225}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009226#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009227
Alexander Belopolsky40018472011-02-26 01:02:56 +00009228PyObject *
9229PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009230{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009231 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009232
9233 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009234 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009235 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009236
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009237 switch(PyUnicode_KIND(string)) {
9238 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009239 if (PyUnicode_IS_ASCII(string))
9240 list = asciilib_splitlines(
9241 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9242 PyUnicode_GET_LENGTH(string), keepends);
9243 else
9244 list = ucs1lib_splitlines(
9245 (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9246 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009247 break;
9248 case PyUnicode_2BYTE_KIND:
9249 list = ucs2lib_splitlines(
9250 (PyObject*) string, PyUnicode_2BYTE_DATA(string),
9251 PyUnicode_GET_LENGTH(string), keepends);
9252 break;
9253 case PyUnicode_4BYTE_KIND:
9254 list = ucs4lib_splitlines(
9255 (PyObject*) string, PyUnicode_4BYTE_DATA(string),
9256 PyUnicode_GET_LENGTH(string), keepends);
9257 break;
9258 default:
9259 assert(0);
9260 list = 0;
9261 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009262 Py_DECREF(string);
9263 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009264}
9265
Alexander Belopolsky40018472011-02-26 01:02:56 +00009266static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009267split(PyObject *self,
9268 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009269 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009270{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009271 int kind1, kind2, kind;
9272 void *buf1, *buf2;
9273 Py_ssize_t len1, len2;
9274 PyObject* out;
9275
Guido van Rossumd57fd912000-03-10 22:53:23 +00009276 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009277 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009278
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009279 if (PyUnicode_READY(self) == -1)
9280 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009281
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009282 if (substring == NULL)
9283 switch(PyUnicode_KIND(self)) {
9284 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009285 if (PyUnicode_IS_ASCII(self))
9286 return asciilib_split_whitespace(
9287 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9288 PyUnicode_GET_LENGTH(self), maxcount
9289 );
9290 else
9291 return ucs1lib_split_whitespace(
9292 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9293 PyUnicode_GET_LENGTH(self), maxcount
9294 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009295 case PyUnicode_2BYTE_KIND:
9296 return ucs2lib_split_whitespace(
9297 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9298 PyUnicode_GET_LENGTH(self), maxcount
9299 );
9300 case PyUnicode_4BYTE_KIND:
9301 return ucs4lib_split_whitespace(
9302 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9303 PyUnicode_GET_LENGTH(self), maxcount
9304 );
9305 default:
9306 assert(0);
9307 return NULL;
9308 }
9309
9310 if (PyUnicode_READY(substring) == -1)
9311 return NULL;
9312
9313 kind1 = PyUnicode_KIND(self);
9314 kind2 = PyUnicode_KIND(substring);
9315 kind = kind1 > kind2 ? kind1 : kind2;
9316 buf1 = PyUnicode_DATA(self);
9317 buf2 = PyUnicode_DATA(substring);
9318 if (kind1 != kind)
9319 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9320 if (!buf1)
9321 return NULL;
9322 if (kind2 != kind)
9323 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9324 if (!buf2) {
9325 if (kind1 != kind) PyMem_Free(buf1);
9326 return NULL;
9327 }
9328 len1 = PyUnicode_GET_LENGTH(self);
9329 len2 = PyUnicode_GET_LENGTH(substring);
9330
9331 switch(kind) {
9332 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009333 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9334 out = asciilib_split(
9335 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9336 else
9337 out = ucs1lib_split(
9338 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009339 break;
9340 case PyUnicode_2BYTE_KIND:
9341 out = ucs2lib_split(
9342 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9343 break;
9344 case PyUnicode_4BYTE_KIND:
9345 out = ucs4lib_split(
9346 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9347 break;
9348 default:
9349 out = NULL;
9350 }
9351 if (kind1 != kind)
9352 PyMem_Free(buf1);
9353 if (kind2 != kind)
9354 PyMem_Free(buf2);
9355 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009356}
9357
Alexander Belopolsky40018472011-02-26 01:02:56 +00009358static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009359rsplit(PyObject *self,
9360 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009361 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009362{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009363 int kind1, kind2, kind;
9364 void *buf1, *buf2;
9365 Py_ssize_t len1, len2;
9366 PyObject* out;
9367
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009368 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009369 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009370
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009371 if (PyUnicode_READY(self) == -1)
9372 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009373
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009374 if (substring == NULL)
9375 switch(PyUnicode_KIND(self)) {
9376 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009377 if (PyUnicode_IS_ASCII(self))
9378 return asciilib_rsplit_whitespace(
9379 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9380 PyUnicode_GET_LENGTH(self), maxcount
9381 );
9382 else
9383 return ucs1lib_rsplit_whitespace(
9384 (PyObject*) self, PyUnicode_1BYTE_DATA(self),
9385 PyUnicode_GET_LENGTH(self), maxcount
9386 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009387 case PyUnicode_2BYTE_KIND:
9388 return ucs2lib_rsplit_whitespace(
9389 (PyObject*) self, PyUnicode_2BYTE_DATA(self),
9390 PyUnicode_GET_LENGTH(self), maxcount
9391 );
9392 case PyUnicode_4BYTE_KIND:
9393 return ucs4lib_rsplit_whitespace(
9394 (PyObject*) self, PyUnicode_4BYTE_DATA(self),
9395 PyUnicode_GET_LENGTH(self), maxcount
9396 );
9397 default:
9398 assert(0);
9399 return NULL;
9400 }
9401
9402 if (PyUnicode_READY(substring) == -1)
9403 return NULL;
9404
9405 kind1 = PyUnicode_KIND(self);
9406 kind2 = PyUnicode_KIND(substring);
9407 kind = kind1 > kind2 ? kind1 : kind2;
9408 buf1 = PyUnicode_DATA(self);
9409 buf2 = PyUnicode_DATA(substring);
9410 if (kind1 != kind)
9411 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9412 if (!buf1)
9413 return NULL;
9414 if (kind2 != kind)
9415 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9416 if (!buf2) {
9417 if (kind1 != kind) PyMem_Free(buf1);
9418 return NULL;
9419 }
9420 len1 = PyUnicode_GET_LENGTH(self);
9421 len2 = PyUnicode_GET_LENGTH(substring);
9422
9423 switch(kind) {
9424 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009425 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9426 out = asciilib_rsplit(
9427 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9428 else
9429 out = ucs1lib_rsplit(
9430 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009431 break;
9432 case PyUnicode_2BYTE_KIND:
9433 out = ucs2lib_rsplit(
9434 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9435 break;
9436 case PyUnicode_4BYTE_KIND:
9437 out = ucs4lib_rsplit(
9438 (PyObject*) self, buf1, len1, buf2, len2, maxcount);
9439 break;
9440 default:
9441 out = NULL;
9442 }
9443 if (kind1 != kind)
9444 PyMem_Free(buf1);
9445 if (kind2 != kind)
9446 PyMem_Free(buf2);
9447 return out;
9448}
9449
9450static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009451anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9452 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009453{
9454 switch(kind) {
9455 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009456 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9457 return asciilib_find(buf1, len1, buf2, len2, offset);
9458 else
9459 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009460 case PyUnicode_2BYTE_KIND:
9461 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9462 case PyUnicode_4BYTE_KIND:
9463 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9464 }
9465 assert(0);
9466 return -1;
9467}
9468
9469static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009470anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
9471 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009472{
9473 switch(kind) {
9474 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009475 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
9476 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
9477 else
9478 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009479 case PyUnicode_2BYTE_KIND:
9480 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9481 case PyUnicode_4BYTE_KIND:
9482 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9483 }
9484 assert(0);
9485 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009486}
9487
Alexander Belopolsky40018472011-02-26 01:02:56 +00009488static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009489replace(PyObject *self, PyObject *str1,
9490 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009491{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009492 PyObject *u;
9493 char *sbuf = PyUnicode_DATA(self);
9494 char *buf1 = PyUnicode_DATA(str1);
9495 char *buf2 = PyUnicode_DATA(str2);
9496 int srelease = 0, release1 = 0, release2 = 0;
9497 int skind = PyUnicode_KIND(self);
9498 int kind1 = PyUnicode_KIND(str1);
9499 int kind2 = PyUnicode_KIND(str2);
9500 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9501 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9502 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009503
9504 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009505 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009506 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009507 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009508
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009509 if (skind < kind1)
9510 /* substring too wide to be present */
9511 goto nothing;
9512
9513 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +00009514 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009515 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009516 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009517 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009518 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009519 /* replace characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009520 Py_UCS4 u1, u2, maxchar;
9521 int mayshrink, rkind;
9522 u1 = PyUnicode_READ_CHAR(str1, 0);
9523 if (!findchar(sbuf, PyUnicode_KIND(self),
9524 slen, u1, 1))
Thomas Wouters477c8d52006-05-27 19:21:47 +00009525 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009526 u2 = PyUnicode_READ_CHAR(str2, 0);
9527 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9528 /* Replacing u1 with u2 may cause a maxchar reduction in the
9529 result string. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009530 if (u2 > maxchar) {
9531 maxchar = u2;
9532 mayshrink = 0;
9533 }
Victor Stinnerb9275c12011-10-05 14:01:42 +02009534 else
9535 mayshrink = maxchar > 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009536 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009537 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009538 goto error;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009539 if (PyUnicode_CopyCharacters(u, 0,
9540 (PyObject*)self, 0, slen) < 0)
9541 {
9542 Py_DECREF(u);
9543 return NULL;
9544 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009545 rkind = PyUnicode_KIND(u);
9546 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
9547 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009548 if (--maxcount < 0)
9549 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009550 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009551 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009552 if (mayshrink) {
9553 PyObject *tmp = u;
9554 u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
9555 PyUnicode_GET_LENGTH(tmp));
9556 Py_DECREF(tmp);
9557 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009558 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009559 int rkind = skind;
9560 char *res;
9561 if (kind1 < rkind) {
9562 /* widen substring */
9563 buf1 = _PyUnicode_AsKind(str1, rkind);
9564 if (!buf1) goto error;
9565 release1 = 1;
9566 }
Victor Stinnerc3cec782011-10-05 21:24:08 +02009567 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009568 if (i < 0)
9569 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009570 if (rkind > kind2) {
9571 /* widen replacement */
9572 buf2 = _PyUnicode_AsKind(str2, rkind);
9573 if (!buf2) goto error;
9574 release2 = 1;
9575 }
9576 else if (rkind < kind2) {
9577 /* widen self and buf1 */
9578 rkind = kind2;
9579 if (release1) PyMem_Free(buf1);
9580 sbuf = _PyUnicode_AsKind(self, rkind);
9581 if (!sbuf) goto error;
9582 srelease = 1;
9583 buf1 = _PyUnicode_AsKind(str1, rkind);
9584 if (!buf1) goto error;
9585 release1 = 1;
9586 }
9587 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
9588 if (!res) {
9589 PyErr_NoMemory();
9590 goto error;
9591 }
9592 memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009593 /* change everything in-place, starting with this one */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009594 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9595 buf2,
9596 PyUnicode_KIND_SIZE(rkind, len2));
9597 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009598
9599 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +02009600 i = anylib_find(rkind, self,
9601 sbuf+PyUnicode_KIND_SIZE(rkind, i), slen-i,
9602 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009603 if (i == -1)
9604 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009605 memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9606 buf2,
9607 PyUnicode_KIND_SIZE(rkind, len2));
9608 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009609 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009610
9611 u = PyUnicode_FromKindAndData(rkind, res, slen);
9612 PyMem_Free(res);
9613 if (!u) goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009614 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009615 } else {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009616
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009617 Py_ssize_t n, i, j, ires;
9618 Py_ssize_t product, new_size;
9619 int rkind = skind;
9620 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009621
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009622 if (kind1 < rkind) {
9623 buf1 = _PyUnicode_AsKind(str1, rkind);
9624 if (!buf1) goto error;
9625 release1 = 1;
9626 }
Victor Stinnerc3cec782011-10-05 21:24:08 +02009627 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009628 if (n == 0)
9629 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009630 if (kind2 < rkind) {
9631 buf2 = _PyUnicode_AsKind(str2, rkind);
9632 if (!buf2) goto error;
9633 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009634 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009635 else if (kind2 > rkind) {
9636 rkind = kind2;
9637 sbuf = _PyUnicode_AsKind(self, rkind);
9638 if (!sbuf) goto error;
9639 srelease = 1;
9640 if (release1) PyMem_Free(buf1);
9641 buf1 = _PyUnicode_AsKind(str1, rkind);
9642 if (!buf1) goto error;
9643 release1 = 1;
9644 }
9645 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
9646 PyUnicode_GET_LENGTH(str1))); */
9647 product = n * (len2-len1);
9648 if ((product / (len2-len1)) != n) {
9649 PyErr_SetString(PyExc_OverflowError,
9650 "replace string is too long");
9651 goto error;
9652 }
9653 new_size = slen + product;
9654 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
9655 PyErr_SetString(PyExc_OverflowError,
9656 "replace string is too long");
9657 goto error;
9658 }
9659 res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
9660 if (!res)
9661 goto error;
9662 ires = i = 0;
9663 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009664 while (n-- > 0) {
9665 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +02009666 j = anylib_find(rkind, self,
9667 sbuf + PyUnicode_KIND_SIZE(rkind, i), slen-i,
9668 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009669 if (j == -1)
9670 break;
9671 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +00009672 /* copy unchanged part [i:j] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009673 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9674 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9675 PyUnicode_KIND_SIZE(rkind, j-i));
9676 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009677 }
9678 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009679 if (len2 > 0) {
9680 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9681 buf2,
9682 PyUnicode_KIND_SIZE(rkind, len2));
9683 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009684 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009685 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009686 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009687 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009688 /* copy tail [i:] */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009689 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9690 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9691 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009692 } else {
9693 /* interleave */
9694 while (n > 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009695 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9696 buf2,
9697 PyUnicode_KIND_SIZE(rkind, len2));
9698 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009699 if (--n <= 0)
9700 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009701 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9702 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9703 PyUnicode_KIND_SIZE(rkind, 1));
9704 ires++;
9705 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009706 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009707 memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9708 sbuf + PyUnicode_KIND_SIZE(rkind, i),
9709 PyUnicode_KIND_SIZE(rkind, slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +00009710 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009711 u = PyUnicode_FromKindAndData(rkind, res, new_size);
Martin v. Löwis0b1d3482011-10-01 16:35:40 +02009712 PyMem_Free(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009713 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009714 if (srelease)
9715 PyMem_FREE(sbuf);
9716 if (release1)
9717 PyMem_FREE(buf1);
9718 if (release2)
9719 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009720 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009721 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009722
Benjamin Peterson29060642009-01-31 22:14:21 +00009723 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +00009724 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009725 if (srelease)
9726 PyMem_FREE(sbuf);
9727 if (release1)
9728 PyMem_FREE(buf1);
9729 if (release2)
9730 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009731 if (PyUnicode_CheckExact(self)) {
9732 Py_INCREF(self);
9733 return (PyObject *) self;
9734 }
Victor Stinner034f6cf2011-09-30 02:26:44 +02009735 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009736 error:
9737 if (srelease && sbuf)
9738 PyMem_FREE(sbuf);
9739 if (release1 && buf1)
9740 PyMem_FREE(buf1);
9741 if (release2 && buf2)
9742 PyMem_FREE(buf2);
9743 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009744}
9745
9746/* --- Unicode Object Methods --------------------------------------------- */
9747
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009748PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009749 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009750\n\
9751Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009752characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009753
9754static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +02009755unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009756{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009757 return fixup(self, fixtitle);
9758}
9759
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009760PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009761 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009762\n\
9763Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +00009764have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009765
9766static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +02009767unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009768{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009769 return fixup(self, fixcapitalize);
9770}
9771
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled code: split `self` into a list, replace every list item by the
   result of fixup(item, fixcapitalize), then join the list back together.
   Returns NULL if the split or any fixup() call fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        result = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                       fixcapitalize);
        if (result == NULL)
            goto onError;
        /* release the old item before the slot is overwritten */
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, result);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

  onError:
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
9809
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009810/* Argument converter. Coerces to a single unicode character */
9811
9812static int
9813convert_uc(PyObject *obj, void *addr)
9814{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009815 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009816 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009817
Benjamin Peterson14339b62009-01-31 16:36:08 +00009818 uniobj = PyUnicode_FromObject(obj);
9819 if (uniobj == NULL) {
9820 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009821 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009822 return 0;
9823 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009824 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009825 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009826 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009827 Py_DECREF(uniobj);
9828 return 0;
9829 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009830 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +00009831 Py_DECREF(uniobj);
9832 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009833}
9834
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00009835PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +00009836 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00009837\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +00009838Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00009839done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00009840
9841static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009842unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009843{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009844 Py_ssize_t marg, left;
9845 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009846 Py_UCS4 fillchar = ' ';
9847
Victor Stinnere9a29352011-10-01 02:14:59 +02009848 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009849 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009850
Victor Stinnere9a29352011-10-01 02:14:59 +02009851 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009852 return NULL;
9853
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009854 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009855 Py_INCREF(self);
9856 return (PyObject*) self;
9857 }
9858
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009859 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009860 left = marg / 2 + (marg & width & 1);
9861
Victor Stinner9310abb2011-10-05 00:59:23 +02009862 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009863}
9864
Marc-André Lemburge5034372000-08-08 08:04:29 +00009865#if 0
9866
9867/* This code should go into some future Unicode collation support
9868 module. The basic comparison should compare ordinals on a naive
Georg Brandlc6c31782009-06-08 13:41:29 +00009869 basis (this is what Java does and thus Jython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00009870
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009871/* speedy UTF-16 code point order comparison */
9872/* gleaned from: */
9873/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9874
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009875static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009876{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009877 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00009878 0, 0, 0, 0, 0, 0, 0, 0,
9879 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00009880 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009881};
9882
Guido van Rossumd57fd912000-03-10 22:53:23 +00009883static int
9884unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9885{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009886 Py_ssize_t len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009887
Guido van Rossumd57fd912000-03-10 22:53:23 +00009888 Py_UNICODE *s1 = str1->str;
9889 Py_UNICODE *s2 = str2->str;
9890
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009891 len1 = str1->_base._base.length;
9892 len2 = str2->_base._base.length;
Tim Petersced69f82003-09-16 20:30:58 +00009893
Guido van Rossumd57fd912000-03-10 22:53:23 +00009894 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00009895 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009896
9897 c1 = *s1++;
9898 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00009899
Benjamin Peterson29060642009-01-31 22:14:21 +00009900 if (c1 > (1<<11) * 26)
9901 c1 += utf16Fixup[c1>>11];
9902 if (c2 > (1<<11) * 26)
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009903 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009904 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00009905
9906 if (c1 != c2)
9907 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00009908
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00009909 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009910 }
9911
9912 return (len1 < len2) ? -1 : (len1 != len2);
9913}
9914
Marc-André Lemburge5034372000-08-08 08:04:29 +00009915#else
9916
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009917/* This function assumes that str1 and str2 are readied by the caller. */
9918
Marc-André Lemburge5034372000-08-08 08:04:29 +00009919static int
9920unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9921{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009922 int kind1, kind2;
9923 void *data1, *data2;
9924 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009925
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009926 kind1 = PyUnicode_KIND(str1);
9927 kind2 = PyUnicode_KIND(str2);
9928 data1 = PyUnicode_DATA(str1);
9929 data2 = PyUnicode_DATA(str2);
9930 len1 = PyUnicode_GET_LENGTH(str1);
9931 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +00009932
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009933 for (i = 0; i < len1 && i < len2; ++i) {
9934 Py_UCS4 c1, c2;
9935 c1 = PyUnicode_READ(kind1, data1, i);
9936 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +00009937
9938 if (c1 != c2)
9939 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +00009940 }
9941
9942 return (len1 < len2) ? -1 : (len1 != len2);
9943}
9944
9945#endif
9946
Alexander Belopolsky40018472011-02-26 01:02:56 +00009947int
9948PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009949{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009950 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9951 if (PyUnicode_READY(left) == -1 ||
9952 PyUnicode_READY(right) == -1)
9953 return -1;
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009954 return unicode_compare((PyUnicodeObject *)left,
9955 (PyUnicodeObject *)right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009956 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +00009957 PyErr_Format(PyExc_TypeError,
9958 "Can't compare %.100s and %.100s",
9959 left->ob_type->tp_name,
9960 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009961 return -1;
9962}
9963
Martin v. Löwis5b222132007-06-10 09:51:05 +00009964int
9965PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9966{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009967 Py_ssize_t i;
9968 int kind;
9969 void *data;
9970 Py_UCS4 chr;
9971
Victor Stinner910337b2011-10-03 03:20:16 +02009972 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009973 if (PyUnicode_READY(uni) == -1)
9974 return -1;
9975 kind = PyUnicode_KIND(uni);
9976 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +00009977 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009978 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9979 if (chr != str[i])
9980 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +00009981 /* This check keeps Python strings that end in '\0' from comparing equal
9982 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009983 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +00009984 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009985 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +00009986 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +00009987 return 0;
9988}
9989
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009990
Benjamin Peterson29060642009-01-31 22:14:21 +00009991#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +00009992 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009993
Alexander Belopolsky40018472011-02-26 01:02:56 +00009994PyObject *
9995PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +00009996{
9997 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009998
Antoine Pitrou51f3ef92008-12-20 13:14:23 +00009999 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10000 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010001 if (PyUnicode_READY(left) == -1 ||
10002 PyUnicode_READY(right) == -1)
10003 return NULL;
10004 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10005 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010006 if (op == Py_EQ) {
10007 Py_INCREF(Py_False);
10008 return Py_False;
10009 }
10010 if (op == Py_NE) {
10011 Py_INCREF(Py_True);
10012 return Py_True;
10013 }
10014 }
10015 if (left == right)
10016 result = 0;
10017 else
10018 result = unicode_compare((PyUnicodeObject *)left,
10019 (PyUnicodeObject *)right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010020
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010021 /* Convert the return value to a Boolean */
10022 switch (op) {
10023 case Py_EQ:
10024 v = TEST_COND(result == 0);
10025 break;
10026 case Py_NE:
10027 v = TEST_COND(result != 0);
10028 break;
10029 case Py_LE:
10030 v = TEST_COND(result <= 0);
10031 break;
10032 case Py_GE:
10033 v = TEST_COND(result >= 0);
10034 break;
10035 case Py_LT:
10036 v = TEST_COND(result == -1);
10037 break;
10038 case Py_GT:
10039 v = TEST_COND(result == 1);
10040 break;
10041 default:
10042 PyErr_BadArgument();
10043 return NULL;
10044 }
10045 Py_INCREF(v);
10046 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010047 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010048
Brian Curtindfc80e32011-08-10 20:28:54 -050010049 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010050}
10051
Alexander Belopolsky40018472011-02-26 01:02:56 +000010052int
10053PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010054{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010055 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010056 int kind1, kind2, kind;
10057 void *buf1, *buf2;
10058 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010059 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010060
10061 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010062 sub = PyUnicode_FromObject(element);
10063 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010064 PyErr_Format(PyExc_TypeError,
10065 "'in <string>' requires string as left operand, not %s",
10066 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010067 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010068 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010069 if (PyUnicode_READY(sub) == -1)
10070 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010071
Thomas Wouters477c8d52006-05-27 19:21:47 +000010072 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +020010073 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010074 Py_DECREF(sub);
10075 return -1;
10076 }
10077
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010078 kind1 = PyUnicode_KIND(str);
10079 kind2 = PyUnicode_KIND(sub);
10080 kind = kind1 > kind2 ? kind1 : kind2;
10081 buf1 = PyUnicode_DATA(str);
10082 buf2 = PyUnicode_DATA(sub);
10083 if (kind1 != kind)
10084 buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
10085 if (!buf1) {
10086 Py_DECREF(sub);
10087 return -1;
10088 }
10089 if (kind2 != kind)
10090 buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
10091 if (!buf2) {
10092 Py_DECREF(sub);
10093 if (kind1 != kind) PyMem_Free(buf1);
10094 return -1;
10095 }
10096 len1 = PyUnicode_GET_LENGTH(str);
10097 len2 = PyUnicode_GET_LENGTH(sub);
10098
10099 switch(kind) {
10100 case PyUnicode_1BYTE_KIND:
10101 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10102 break;
10103 case PyUnicode_2BYTE_KIND:
10104 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10105 break;
10106 case PyUnicode_4BYTE_KIND:
10107 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10108 break;
10109 default:
10110 result = -1;
10111 assert(0);
10112 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010113
10114 Py_DECREF(str);
10115 Py_DECREF(sub);
10116
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010117 if (kind1 != kind)
10118 PyMem_Free(buf1);
10119 if (kind2 != kind)
10120 PyMem_Free(buf2);
10121
Guido van Rossum403d68b2000-03-13 15:55:09 +000010122 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010123}
10124
Guido van Rossumd57fd912000-03-10 22:53:23 +000010125/* Concat to string or Unicode object giving a new Unicode object. */
10126
Alexander Belopolsky40018472011-02-26 01:02:56 +000010127PyObject *
10128PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010129{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010130 PyObject *u = NULL, *v = NULL, *w;
10131 Py_UCS4 maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010132
10133 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010134 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010135 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010136 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010137 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010138 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010139 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010140
10141 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010142 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010143 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010144 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010145 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010146 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010147 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010148 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010149 }
10150
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010151 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinnerff9e50f2011-09-28 22:17:19 +020010152 maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010153
Guido van Rossumd57fd912000-03-10 22:53:23 +000010154 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010155 w = PyUnicode_New(
10156 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
10157 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010158 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010159 goto onError;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010160 if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0)
10161 goto onError;
Victor Stinner157f83f2011-09-28 21:41:31 +020010162 if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u),
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010163 v, 0,
10164 PyUnicode_GET_LENGTH(v)) < 0)
10165 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010166 Py_DECREF(u);
10167 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010168 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010169 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010170
Benjamin Peterson29060642009-01-31 22:14:21 +000010171 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010172 Py_XDECREF(u);
10173 Py_XDECREF(v);
10174 return NULL;
10175}
10176
Victor Stinnerb0923652011-10-04 01:17:31 +020010177static void
10178unicode_append_inplace(PyObject **p_left, PyObject *right)
10179{
10180 Py_ssize_t left_len, right_len, new_len;
10181#ifdef Py_DEBUG
10182 Py_ssize_t copied;
10183#endif
10184
10185 assert(PyUnicode_IS_READY(*p_left));
10186 assert(PyUnicode_IS_READY(right));
10187
10188 left_len = PyUnicode_GET_LENGTH(*p_left);
10189 right_len = PyUnicode_GET_LENGTH(right);
10190 if (left_len > PY_SSIZE_T_MAX - right_len) {
10191 PyErr_SetString(PyExc_OverflowError,
10192 "strings are too large to concat");
10193 goto error;
10194 }
10195 new_len = left_len + right_len;
10196
10197 /* Now we own the last reference to 'left', so we can resize it
10198 * in-place.
10199 */
10200 if (unicode_resize(p_left, new_len) != 0) {
10201 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10202 * deallocated so it cannot be put back into
10203 * 'variable'. The MemoryError is raised when there
10204 * is no value in 'variable', which might (very
10205 * remotely) be a cause of incompatibilities.
10206 */
10207 goto error;
10208 }
10209 /* copy 'right' into the newly allocated area of 'left' */
10210#ifdef Py_DEBUG
10211 copied = PyUnicode_CopyCharacters(*p_left, left_len,
10212 right, 0,
10213 right_len);
10214 assert(0 <= copied);
10215#else
10216 PyUnicode_CopyCharacters(*p_left, left_len, right, 0, right_len);
10217#endif
10218 return;
10219
10220error:
10221 Py_DECREF(*p_left);
10222 *p_left = NULL;
10223}
10224
Walter Dörwald1ab83302007-05-18 17:15:44 +000010225void
Victor Stinner23e56682011-10-03 03:54:37 +020010226PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010227{
Victor Stinner23e56682011-10-03 03:54:37 +020010228 PyObject *left, *res;
10229
10230 if (p_left == NULL) {
10231 if (!PyErr_Occurred())
10232 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010233 return;
10234 }
Victor Stinner23e56682011-10-03 03:54:37 +020010235 left = *p_left;
10236 if (right == NULL || !PyUnicode_Check(left)) {
10237 if (!PyErr_Occurred())
10238 PyErr_BadInternalCall();
10239 goto error;
10240 }
10241
Victor Stinnere1335c72011-10-04 20:53:03 +020010242 if (PyUnicode_READY(left))
10243 goto error;
10244 if (PyUnicode_READY(right))
10245 goto error;
10246
Victor Stinner23e56682011-10-03 03:54:37 +020010247 if (PyUnicode_CheckExact(left) && left != unicode_empty
10248 && PyUnicode_CheckExact(right) && right != unicode_empty
10249 && unicode_resizable(left)
10250 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
10251 || _PyUnicode_WSTR(left) != NULL))
10252 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010253 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10254 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010255 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010256 not so different than duplicating the string. */
10257 if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
Victor Stinner23e56682011-10-03 03:54:37 +020010258 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010259 unicode_append_inplace(p_left, right);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010260 if (p_left != NULL)
10261 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010262 return;
10263 }
10264 }
10265
10266 res = PyUnicode_Concat(left, right);
10267 if (res == NULL)
10268 goto error;
10269 Py_DECREF(left);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010270 assert(_PyUnicode_CheckConsistency(res, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010271 *p_left = res;
10272 return;
10273
10274error:
10275 Py_DECREF(*p_left);
10276 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +000010277}
10278
10279void
10280PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10281{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010282 PyUnicode_Append(pleft, right);
10283 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010284}
10285
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010286PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010287 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010288\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010289Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010290string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010291interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010292
10293static PyObject *
10294unicode_count(PyUnicodeObject *self, PyObject *args)
10295{
10296 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010297 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010298 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010299 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010300 int kind1, kind2, kind;
10301 void *buf1, *buf2;
10302 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010303
Jesus Ceaac451502011-04-20 17:09:23 +020010304 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10305 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010306 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010307
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010308 kind1 = PyUnicode_KIND(self);
10309 kind2 = PyUnicode_KIND(substring);
10310 kind = kind1 > kind2 ? kind1 : kind2;
10311 buf1 = PyUnicode_DATA(self);
10312 buf2 = PyUnicode_DATA(substring);
10313 if (kind1 != kind)
10314 buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
10315 if (!buf1) {
10316 Py_DECREF(substring);
10317 return NULL;
10318 }
10319 if (kind2 != kind)
10320 buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
10321 if (!buf2) {
10322 Py_DECREF(substring);
10323 if (kind1 != kind) PyMem_Free(buf1);
10324 return NULL;
10325 }
10326 len1 = PyUnicode_GET_LENGTH(self);
10327 len2 = PyUnicode_GET_LENGTH(substring);
10328
10329 ADJUST_INDICES(start, end, len1);
10330 switch(kind) {
10331 case PyUnicode_1BYTE_KIND:
10332 iresult = ucs1lib_count(
10333 ((Py_UCS1*)buf1) + start, end - start,
10334 buf2, len2, PY_SSIZE_T_MAX
10335 );
10336 break;
10337 case PyUnicode_2BYTE_KIND:
10338 iresult = ucs2lib_count(
10339 ((Py_UCS2*)buf1) + start, end - start,
10340 buf2, len2, PY_SSIZE_T_MAX
10341 );
10342 break;
10343 case PyUnicode_4BYTE_KIND:
10344 iresult = ucs4lib_count(
10345 ((Py_UCS4*)buf1) + start, end - start,
10346 buf2, len2, PY_SSIZE_T_MAX
10347 );
10348 break;
10349 default:
10350 assert(0); iresult = 0;
10351 }
10352
10353 result = PyLong_FromSsize_t(iresult);
10354
10355 if (kind1 != kind)
10356 PyMem_Free(buf1);
10357 if (kind2 != kind)
10358 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010359
10360 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010361
Guido van Rossumd57fd912000-03-10 22:53:23 +000010362 return result;
10363}
10364
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010365PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010366 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010367\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010368Encode S using the codec registered for encoding. Default encoding\n\
10369is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010370handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010371a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10372'xmlcharrefreplace' as well as any other name registered with\n\
10373codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010374
10375static PyObject *
Benjamin Peterson308d6372009-09-18 21:42:35 +000010376unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010377{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010378 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010379 char *encoding = NULL;
10380 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010381
Benjamin Peterson308d6372009-09-18 21:42:35 +000010382 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10383 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010384 return NULL;
Georg Brandl3b9406b2010-12-03 07:54:09 +000010385 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010386}
10387
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010388PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010389 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010390\n\
10391Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010392If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010393
10394static PyObject*
10395unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
10396{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010397 Py_ssize_t i, j, line_pos, src_len, incr;
10398 Py_UCS4 ch;
10399 PyObject *u;
10400 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010401 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010402 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010403 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010404
10405 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010406 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010407
Antoine Pitrou22425222011-10-04 19:10:51 +020010408 if (PyUnicode_READY(self) == -1)
10409 return NULL;
10410
Thomas Wouters7e474022000-07-16 12:04:32 +000010411 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010412 src_len = PyUnicode_GET_LENGTH(self);
10413 i = j = line_pos = 0;
10414 kind = PyUnicode_KIND(self);
10415 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010416 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010417 for (; i < src_len; i++) {
10418 ch = PyUnicode_READ(kind, src_data, i);
10419 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010420 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010421 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010422 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010423 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010424 goto overflow;
10425 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010426 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010427 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010428 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010429 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010430 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010431 goto overflow;
10432 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010433 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010434 if (ch == '\n' || ch == '\r')
10435 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010436 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010437 }
Antoine Pitroue19aa382011-10-04 16:04:01 +020010438 if (!found && PyUnicode_CheckExact(self)) {
10439 Py_INCREF((PyObject *) self);
10440 return (PyObject *) self;
10441 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010442
Guido van Rossumd57fd912000-03-10 22:53:23 +000010443 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010444 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010445 if (!u)
10446 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010447 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010448
Antoine Pitroue71d5742011-10-04 15:55:09 +020010449 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010450
Antoine Pitroue71d5742011-10-04 15:55:09 +020010451 for (; i < src_len; i++) {
10452 ch = PyUnicode_READ(kind, src_data, i);
10453 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010454 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010455 incr = tabsize - (line_pos % tabsize);
10456 line_pos += incr;
10457 while (incr--) {
10458 PyUnicode_WRITE(kind, dest_data, j, ' ');
10459 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010460 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010461 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010462 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010463 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010464 line_pos++;
10465 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010466 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010467 if (ch == '\n' || ch == '\r')
10468 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010469 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010470 }
10471 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinner17efeed2011-10-04 20:05:46 +020010472#ifndef DONT_MAKE_RESULT_READY
10473 if (_PyUnicode_READY_REPLACE(&u)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010474 Py_DECREF(u);
10475 return NULL;
10476 }
Victor Stinner17efeed2011-10-04 20:05:46 +020010477#endif
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010478 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010479 return (PyObject*) u;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010480
Antoine Pitroue71d5742011-10-04 15:55:09 +020010481 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010482 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10483 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010484}
10485
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010486PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010487 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010488\n\
10489Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010490such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010491arguments start and end are interpreted as in slice notation.\n\
10492\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010493Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010494
10495static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010496unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010497{
Jesus Ceaac451502011-04-20 17:09:23 +020010498 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010499 Py_ssize_t start;
10500 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010501 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010502
Jesus Ceaac451502011-04-20 17:09:23 +020010503 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10504 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010505 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010506
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010507 if (PyUnicode_READY(self) == -1)
10508 return NULL;
10509 if (PyUnicode_READY(substring) == -1)
10510 return NULL;
10511
10512 result = any_find_slice(
Victor Stinnerc3cec782011-10-05 21:24:08 +020010513 asciilib_find_slice, ucs1lib_find_slice,
10514 ucs2lib_find_slice, ucs4lib_find_slice,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010515 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010516 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010517
10518 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010519
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010520 if (result == -2)
10521 return NULL;
10522
Christian Heimes217cfd12007-12-02 14:31:20 +000010523 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010524}
10525
10526static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010527unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010528{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010529 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10530 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010531 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010532 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010533}
10534
Guido van Rossumc2504932007-09-18 19:42:40 +000010535/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010536 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010537static Py_hash_t
Neil Schemenauerf8c37d12007-09-07 20:49:04 +000010538unicode_hash(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010539{
Guido van Rossumc2504932007-09-18 19:42:40 +000010540 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010541 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010542
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010543 if (_PyUnicode_HASH(self) != -1)
10544 return _PyUnicode_HASH(self);
10545 if (PyUnicode_READY(self) == -1)
10546 return -1;
10547 len = PyUnicode_GET_LENGTH(self);
10548
10549 /* The hash function as a macro, gets expanded three times below. */
10550#define HASH(P) \
10551 x = (Py_uhash_t)*P << 7; \
10552 while (--len >= 0) \
10553 x = (1000003*x) ^ (Py_uhash_t)*P++;
10554
10555 switch (PyUnicode_KIND(self)) {
10556 case PyUnicode_1BYTE_KIND: {
10557 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10558 HASH(c);
10559 break;
10560 }
10561 case PyUnicode_2BYTE_KIND: {
10562 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10563 HASH(s);
10564 break;
10565 }
10566 default: {
10567 Py_UCS4 *l;
10568 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10569 "Impossible switch case in unicode_hash");
10570 l = PyUnicode_4BYTE_DATA(self);
10571 HASH(l);
10572 break;
10573 }
10574 }
10575 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
10576
Guido van Rossumc2504932007-09-18 19:42:40 +000010577 if (x == -1)
10578 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010579 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010580 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010581}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010582#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000010583
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010584PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010585 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010586\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010587Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010588
10589static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010590unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010591{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010592 Py_ssize_t result;
Jesus Ceaac451502011-04-20 17:09:23 +020010593 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010594 Py_ssize_t start;
10595 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010596
Jesus Ceaac451502011-04-20 17:09:23 +020010597 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10598 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010599 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010600
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010601 if (PyUnicode_READY(self) == -1)
10602 return NULL;
10603 if (PyUnicode_READY(substring) == -1)
10604 return NULL;
10605
10606 result = any_find_slice(
Victor Stinnerc3cec782011-10-05 21:24:08 +020010607 asciilib_find_slice, ucs1lib_find_slice,
10608 ucs2lib_find_slice, ucs4lib_find_slice,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010609 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000010610 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000010611
10612 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010613
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010614 if (result == -2)
10615 return NULL;
10616
Guido van Rossumd57fd912000-03-10 22:53:23 +000010617 if (result < 0) {
10618 PyErr_SetString(PyExc_ValueError, "substring not found");
10619 return NULL;
10620 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010621
Christian Heimes217cfd12007-12-02 14:31:20 +000010622 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010623}
10624
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010625PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010626 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010627\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010628Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010629at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010630
10631static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010632unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010633{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010634 Py_ssize_t i, length;
10635 int kind;
10636 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010637 int cased;
10638
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010639 if (PyUnicode_READY(self) == -1)
10640 return NULL;
10641 length = PyUnicode_GET_LENGTH(self);
10642 kind = PyUnicode_KIND(self);
10643 data = PyUnicode_DATA(self);
10644
Guido van Rossumd57fd912000-03-10 22:53:23 +000010645 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010646 if (length == 1)
10647 return PyBool_FromLong(
10648 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010649
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010650 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010651 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010652 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010653
Guido van Rossumd57fd912000-03-10 22:53:23 +000010654 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010655 for (i = 0; i < length; i++) {
10656 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010657
Benjamin Peterson29060642009-01-31 22:14:21 +000010658 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
10659 return PyBool_FromLong(0);
10660 else if (!cased && Py_UNICODE_ISLOWER(ch))
10661 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010662 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010663 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010664}
10665
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010666PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010667 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010668\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010669Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010670at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010671
10672static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010673unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010674{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010675 Py_ssize_t i, length;
10676 int kind;
10677 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010678 int cased;
10679
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010680 if (PyUnicode_READY(self) == -1)
10681 return NULL;
10682 length = PyUnicode_GET_LENGTH(self);
10683 kind = PyUnicode_KIND(self);
10684 data = PyUnicode_DATA(self);
10685
Guido van Rossumd57fd912000-03-10 22:53:23 +000010686 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010687 if (length == 1)
10688 return PyBool_FromLong(
10689 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010690
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010691 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010692 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010693 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010694
Guido van Rossumd57fd912000-03-10 22:53:23 +000010695 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010696 for (i = 0; i < length; i++) {
10697 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010698
Benjamin Peterson29060642009-01-31 22:14:21 +000010699 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
10700 return PyBool_FromLong(0);
10701 else if (!cased && Py_UNICODE_ISUPPER(ch))
10702 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010703 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010704 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010705}
10706
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010707PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010708 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010709\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010710Return True if S is a titlecased string and there is at least one\n\
10711character in S, i.e. upper- and titlecase characters may only\n\
10712follow uncased characters and lowercase characters only cased ones.\n\
10713Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010714
10715static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010716unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010717{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010718 Py_ssize_t i, length;
10719 int kind;
10720 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010721 int cased, previous_is_cased;
10722
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010723 if (PyUnicode_READY(self) == -1)
10724 return NULL;
10725 length = PyUnicode_GET_LENGTH(self);
10726 kind = PyUnicode_KIND(self);
10727 data = PyUnicode_DATA(self);
10728
Guido van Rossumd57fd912000-03-10 22:53:23 +000010729 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010730 if (length == 1) {
10731 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10732 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
10733 (Py_UNICODE_ISUPPER(ch) != 0));
10734 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010735
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010736 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010737 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010738 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010739
Guido van Rossumd57fd912000-03-10 22:53:23 +000010740 cased = 0;
10741 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010742 for (i = 0; i < length; i++) {
10743 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000010744
Benjamin Peterson29060642009-01-31 22:14:21 +000010745 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10746 if (previous_is_cased)
10747 return PyBool_FromLong(0);
10748 previous_is_cased = 1;
10749 cased = 1;
10750 }
10751 else if (Py_UNICODE_ISLOWER(ch)) {
10752 if (!previous_is_cased)
10753 return PyBool_FromLong(0);
10754 previous_is_cased = 1;
10755 cased = 1;
10756 }
10757 else
10758 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010759 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010760 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010761}
10762
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010763PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010764 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010765\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010766Return True if all characters in S are whitespace\n\
10767and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010768
10769static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010770unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010771{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010772 Py_ssize_t i, length;
10773 int kind;
10774 void *data;
10775
10776 if (PyUnicode_READY(self) == -1)
10777 return NULL;
10778 length = PyUnicode_GET_LENGTH(self);
10779 kind = PyUnicode_KIND(self);
10780 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010781
Guido van Rossumd57fd912000-03-10 22:53:23 +000010782 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010783 if (length == 1)
10784 return PyBool_FromLong(
10785 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010786
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010787 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010788 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010789 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010790
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010791 for (i = 0; i < length; i++) {
10792 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010793 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010794 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010795 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010796 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010797}
10798
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010799PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010800 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010801\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010802Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010803and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010804
10805static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010806unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010807{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010808 Py_ssize_t i, length;
10809 int kind;
10810 void *data;
10811
10812 if (PyUnicode_READY(self) == -1)
10813 return NULL;
10814 length = PyUnicode_GET_LENGTH(self);
10815 kind = PyUnicode_KIND(self);
10816 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010817
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010818 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010819 if (length == 1)
10820 return PyBool_FromLong(
10821 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010822
10823 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010824 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010825 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010826
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010827 for (i = 0; i < length; i++) {
10828 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010829 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010830 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010831 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010832}
10833
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010834PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010835 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010836\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010837Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010838and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010839
10840static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010841unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010842{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010843 int kind;
10844 void *data;
10845 Py_ssize_t len, i;
10846
10847 if (PyUnicode_READY(self) == -1)
10848 return NULL;
10849
10850 kind = PyUnicode_KIND(self);
10851 data = PyUnicode_DATA(self);
10852 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010853
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010854 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010855 if (len == 1) {
10856 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10857 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10858 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010859
10860 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010861 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010862 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010863
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010864 for (i = 0; i < len; i++) {
10865 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010866 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000010867 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010868 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010869 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000010870}
10871
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010872PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010873 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010874\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010875Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010876False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010877
10878static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010879unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010880{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010881 Py_ssize_t i, length;
10882 int kind;
10883 void *data;
10884
10885 if (PyUnicode_READY(self) == -1)
10886 return NULL;
10887 length = PyUnicode_GET_LENGTH(self);
10888 kind = PyUnicode_KIND(self);
10889 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010890
Guido van Rossumd57fd912000-03-10 22:53:23 +000010891 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010892 if (length == 1)
10893 return PyBool_FromLong(
10894 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010895
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010896 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010897 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010898 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010899
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010900 for (i = 0; i < length; i++) {
10901 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010902 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010903 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010904 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010905}
10906
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010907PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010908 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010909\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000010910Return True if all characters in S are digits\n\
10911and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010912
10913static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010914unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010915{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010916 Py_ssize_t i, length;
10917 int kind;
10918 void *data;
10919
10920 if (PyUnicode_READY(self) == -1)
10921 return NULL;
10922 length = PyUnicode_GET_LENGTH(self);
10923 kind = PyUnicode_KIND(self);
10924 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010925
Guido van Rossumd57fd912000-03-10 22:53:23 +000010926 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010927 if (length == 1) {
10928 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10929 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10930 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010931
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010932 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010933 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010934 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010935
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010936 for (i = 0; i < length; i++) {
10937 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010938 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010939 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010940 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010941}
10942
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010943PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010944 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010945\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000010946Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010947False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010948
10949static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000010950unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010951{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010952 Py_ssize_t i, length;
10953 int kind;
10954 void *data;
10955
10956 if (PyUnicode_READY(self) == -1)
10957 return NULL;
10958 length = PyUnicode_GET_LENGTH(self);
10959 kind = PyUnicode_KIND(self);
10960 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010961
Guido van Rossumd57fd912000-03-10 22:53:23 +000010962 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010963 if (length == 1)
10964 return PyBool_FromLong(
10965 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010966
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010967 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010968 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010969 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000010970
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010971 for (i = 0; i < length; i++) {
10972 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000010973 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010974 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000010975 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010976}
10977
Martin v. Löwis47383402007-08-15 07:32:56 +000010978int
10979PyUnicode_IsIdentifier(PyObject *self)
10980{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010981 int kind;
10982 void *data;
10983 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030010984 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000010985
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010986 if (PyUnicode_READY(self) == -1) {
10987 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000010988 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010989 }
10990
10991 /* Special case for empty strings */
10992 if (PyUnicode_GET_LENGTH(self) == 0)
10993 return 0;
10994 kind = PyUnicode_KIND(self);
10995 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000010996
10997 /* PEP 3131 says that the first character must be in
10998 XID_Start and subsequent characters in XID_Continue,
10999 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011000 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011001 letters, digits, underscore). However, given the current
11002 definition of XID_Start and XID_Continue, it is sufficient
11003 to check just for these, except that _ must be allowed
11004 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011005 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011006 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011007 return 0;
11008
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011009 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011010 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011011 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011012 return 1;
11013}
11014
11015PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011016 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011017\n\
11018Return True if S is a valid identifier according\n\
11019to the language definition.");
11020
11021static PyObject*
11022unicode_isidentifier(PyObject *self)
11023{
11024 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11025}
11026
Georg Brandl559e5d72008-06-11 18:37:52 +000011027PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011028 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011029\n\
11030Return True if all characters in S are considered\n\
11031printable in repr() or S is empty, False otherwise.");
11032
11033static PyObject*
11034unicode_isprintable(PyObject *self)
11035{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011036 Py_ssize_t i, length;
11037 int kind;
11038 void *data;
11039
11040 if (PyUnicode_READY(self) == -1)
11041 return NULL;
11042 length = PyUnicode_GET_LENGTH(self);
11043 kind = PyUnicode_KIND(self);
11044 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011045
11046 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011047 if (length == 1)
11048 return PyBool_FromLong(
11049 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011050
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011051 for (i = 0; i < length; i++) {
11052 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011053 Py_RETURN_FALSE;
11054 }
11055 }
11056 Py_RETURN_TRUE;
11057}
11058
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011059PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011060 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011061\n\
11062Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011063iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011064
11065static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011066unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011067{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011068 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011069}
11070
Martin v. Löwis18e16552006-02-15 17:27:45 +000011071static Py_ssize_t
Guido van Rossumd57fd912000-03-10 22:53:23 +000011072unicode_length(PyUnicodeObject *self)
11073{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011074 if (PyUnicode_READY(self) == -1)
11075 return -1;
11076 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011077}
11078
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011079PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011080 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011081\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011082Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011083done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011084
11085static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011086unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011087{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011088 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011089 Py_UCS4 fillchar = ' ';
11090
11091 if (PyUnicode_READY(self) == -1)
11092 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011093
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011094 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011095 return NULL;
11096
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011097 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011098 Py_INCREF(self);
11099 return (PyObject*) self;
11100 }
11101
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011102 return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011103}
11104
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011105PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011106 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011107\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011108Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011109
11110static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011111unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011112{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011113 return fixup(self, fixlower);
11114}
11115
/* Selectors for the strip family: which end(s) of the string to trim. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the selectors above. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for selector i: the format string minus its "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
11124
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011125/* externally visible for str.strip(unicode) */
11126PyObject *
11127_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
11128{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011129 void *data;
11130 int kind;
11131 Py_ssize_t i, j, len;
11132 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011133
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011134 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11135 return NULL;
11136
11137 kind = PyUnicode_KIND(self);
11138 data = PyUnicode_DATA(self);
11139 len = PyUnicode_GET_LENGTH(self);
11140 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11141 PyUnicode_DATA(sepobj),
11142 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011143
Benjamin Peterson14339b62009-01-31 16:36:08 +000011144 i = 0;
11145 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011146 while (i < len &&
11147 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011148 i++;
11149 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011150 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011151
Benjamin Peterson14339b62009-01-31 16:36:08 +000011152 j = len;
11153 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011154 do {
11155 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011156 } while (j >= i &&
11157 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011158 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011159 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011160
Victor Stinner12bab6d2011-10-01 01:53:49 +020011161 return PyUnicode_Substring((PyObject*)self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011162}
11163
11164PyObject*
11165PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11166{
11167 unsigned char *data;
11168 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011169 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011170
Victor Stinnerde636f32011-10-01 03:55:54 +020011171 if (PyUnicode_READY(self) == -1)
11172 return NULL;
11173
11174 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11175
Victor Stinner12bab6d2011-10-01 01:53:49 +020011176 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011177 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011178 if (PyUnicode_CheckExact(self)) {
11179 Py_INCREF(self);
11180 return self;
11181 }
11182 else
11183 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011184 }
11185
Victor Stinner12bab6d2011-10-01 01:53:49 +020011186 length = end - start;
11187 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011188 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011189
Victor Stinnerde636f32011-10-01 03:55:54 +020011190 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011191 PyErr_SetString(PyExc_IndexError, "string index out of range");
11192 return NULL;
11193 }
11194
Victor Stinnerb9275c12011-10-05 14:01:42 +020011195 if (PyUnicode_IS_ASCII(self)) {
11196 kind = PyUnicode_KIND(self);
11197 data = PyUnicode_1BYTE_DATA(self);
11198 return unicode_fromascii(data + start, length);
11199 }
11200 else {
11201 kind = PyUnicode_KIND(self);
11202 data = PyUnicode_1BYTE_DATA(self);
11203 return PyUnicode_FromKindAndData(kind,
11204 data + PyUnicode_KIND_SIZE(kind, start),
11205 length);
11206 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011207}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011208
11209static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011210do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011211{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011212 int kind;
11213 void *data;
11214 Py_ssize_t len, i, j;
11215
11216 if (PyUnicode_READY(self) == -1)
11217 return NULL;
11218
11219 kind = PyUnicode_KIND(self);
11220 data = PyUnicode_DATA(self);
11221 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011222
Benjamin Peterson14339b62009-01-31 16:36:08 +000011223 i = 0;
11224 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011225 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011226 i++;
11227 }
11228 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011229
Benjamin Peterson14339b62009-01-31 16:36:08 +000011230 j = len;
11231 if (striptype != LEFTSTRIP) {
11232 do {
11233 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011234 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011235 j++;
11236 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011237
Victor Stinner12bab6d2011-10-01 01:53:49 +020011238 return PyUnicode_Substring((PyObject*)self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011239}
11240
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011241
11242static PyObject *
11243do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
11244{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011245 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011246
Benjamin Peterson14339b62009-01-31 16:36:08 +000011247 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11248 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011249
Benjamin Peterson14339b62009-01-31 16:36:08 +000011250 if (sep != NULL && sep != Py_None) {
11251 if (PyUnicode_Check(sep))
11252 return _PyUnicode_XStrip(self, striptype, sep);
11253 else {
11254 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011255 "%s arg must be None or str",
11256 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011257 return NULL;
11258 }
11259 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011260
Benjamin Peterson14339b62009-01-31 16:36:08 +000011261 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011262}
11263
11264
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011265PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011266 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011267\n\
11268Return a copy of the string S with leading and trailing\n\
11269whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011270If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011271
11272static PyObject *
11273unicode_strip(PyUnicodeObject *self, PyObject *args)
11274{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011275 if (PyTuple_GET_SIZE(args) == 0)
11276 return do_strip(self, BOTHSTRIP); /* Common case */
11277 else
11278 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011279}
11280
11281
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011282PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011283 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011284\n\
11285Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011286If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011287
11288static PyObject *
11289unicode_lstrip(PyUnicodeObject *self, PyObject *args)
11290{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011291 if (PyTuple_GET_SIZE(args) == 0)
11292 return do_strip(self, LEFTSTRIP); /* Common case */
11293 else
11294 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011295}
11296
11297
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011298PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011299 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011300\n\
11301Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011302If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011303
11304static PyObject *
11305unicode_rstrip(PyUnicodeObject *self, PyObject *args)
11306{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011307 if (PyTuple_GET_SIZE(args) == 0)
11308 return do_strip(self, RIGHTSTRIP); /* Common case */
11309 else
11310 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011311}
11312
11313
Guido van Rossumd57fd912000-03-10 22:53:23 +000011314static PyObject*
Martin v. Löwis18e16552006-02-15 17:27:45 +000011315unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011316{
11317 PyUnicodeObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011318 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011319
Georg Brandl222de0f2009-04-12 12:01:50 +000011320 if (len < 1) {
11321 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011322 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011323 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011324
Tim Peters7a29bd52001-09-12 03:03:31 +000011325 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011326 /* no repeat, return original string */
11327 Py_INCREF(str);
11328 return (PyObject*) str;
11329 }
Tim Peters8f422462000-09-09 06:13:41 +000011330
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011331 if (PyUnicode_READY(str) == -1)
11332 return NULL;
11333
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011334 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011335 PyErr_SetString(PyExc_OverflowError,
11336 "repeated string is too long");
11337 return NULL;
11338 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011339 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011340
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011341 u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011342 if (!u)
11343 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011344 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011345
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011346 if (PyUnicode_GET_LENGTH(str) == 1) {
11347 const int kind = PyUnicode_KIND(str);
11348 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11349 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011350 if (kind == PyUnicode_1BYTE_KIND)
11351 memset(to, (unsigned char)fill_char, len);
11352 else {
11353 for (n = 0; n < len; ++n)
11354 PyUnicode_WRITE(kind, to, n, fill_char);
11355 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011356 }
11357 else {
11358 /* number of characters copied this far */
11359 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
11360 const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
11361 char *to = (char *) PyUnicode_DATA(u);
11362 Py_MEMCPY(to, PyUnicode_DATA(str),
11363 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011364 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011365 n = (done <= nchars-done) ? done : nchars-done;
11366 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011367 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011368 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011369 }
11370
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011371 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011372 return (PyObject*) u;
11373}
11374
Alexander Belopolsky40018472011-02-26 01:02:56 +000011375PyObject *
11376PyUnicode_Replace(PyObject *obj,
11377 PyObject *subobj,
11378 PyObject *replobj,
11379 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011380{
11381 PyObject *self;
11382 PyObject *str1;
11383 PyObject *str2;
11384 PyObject *result;
11385
11386 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011387 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011388 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011389 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011390 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011391 Py_DECREF(self);
11392 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011393 }
11394 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011395 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011396 Py_DECREF(self);
11397 Py_DECREF(str1);
11398 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011399 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011400 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011401 Py_DECREF(self);
11402 Py_DECREF(str1);
11403 Py_DECREF(str2);
11404 return result;
11405}
11406
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011407PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011408 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011409\n\
11410Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011411old replaced by new. If the optional argument count is\n\
11412given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011413
11414static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011415unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011416{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011417 PyObject *str1;
11418 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011419 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011420 PyObject *result;
11421
Martin v. Löwis18e16552006-02-15 17:27:45 +000011422 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011423 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011424 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011425 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011426 str1 = PyUnicode_FromObject(str1);
11427 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11428 return NULL;
11429 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011430 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011431 Py_DECREF(str1);
11432 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011433 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011434
11435 result = replace(self, str1, str2, maxcount);
11436
11437 Py_DECREF(str1);
11438 Py_DECREF(str2);
11439 return result;
11440}
11441
Alexander Belopolsky40018472011-02-26 01:02:56 +000011442static PyObject *
11443unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011444{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011445 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011446 Py_ssize_t isize;
11447 Py_ssize_t osize, squote, dquote, i, o;
11448 Py_UCS4 max, quote;
11449 int ikind, okind;
11450 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011451
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011452 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011453 return NULL;
11454
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011455 isize = PyUnicode_GET_LENGTH(unicode);
11456 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011457
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011458 /* Compute length of output, quote characters, and
11459 maximum character */
11460 osize = 2; /* quotes */
11461 max = 127;
11462 squote = dquote = 0;
11463 ikind = PyUnicode_KIND(unicode);
11464 for (i = 0; i < isize; i++) {
11465 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11466 switch (ch) {
11467 case '\'': squote++; osize++; break;
11468 case '"': dquote++; osize++; break;
11469 case '\\': case '\t': case '\r': case '\n':
11470 osize += 2; break;
11471 default:
11472 /* Fast-path ASCII */
11473 if (ch < ' ' || ch == 0x7f)
11474 osize += 4; /* \xHH */
11475 else if (ch < 0x7f)
11476 osize++;
11477 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11478 osize++;
11479 max = ch > max ? ch : max;
11480 }
11481 else if (ch < 0x100)
11482 osize += 4; /* \xHH */
11483 else if (ch < 0x10000)
11484 osize += 6; /* \uHHHH */
11485 else
11486 osize += 10; /* \uHHHHHHHH */
11487 }
11488 }
11489
11490 quote = '\'';
11491 if (squote) {
11492 if (dquote)
11493 /* Both squote and dquote present. Use squote,
11494 and escape them */
11495 osize += squote;
11496 else
11497 quote = '"';
11498 }
11499
11500 repr = PyUnicode_New(osize, max);
11501 if (repr == NULL)
11502 return NULL;
11503 okind = PyUnicode_KIND(repr);
11504 odata = PyUnicode_DATA(repr);
11505
11506 PyUnicode_WRITE(okind, odata, 0, quote);
11507 PyUnicode_WRITE(okind, odata, osize-1, quote);
11508
11509 for (i = 0, o = 1; i < isize; i++) {
11510 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011511
11512 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011513 if ((ch == quote) || (ch == '\\')) {
11514 PyUnicode_WRITE(okind, odata, o++, '\\');
11515 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011516 continue;
11517 }
11518
Benjamin Peterson29060642009-01-31 22:14:21 +000011519 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011520 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011521 PyUnicode_WRITE(okind, odata, o++, '\\');
11522 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011523 }
11524 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011525 PyUnicode_WRITE(okind, odata, o++, '\\');
11526 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011527 }
11528 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011529 PyUnicode_WRITE(okind, odata, o++, '\\');
11530 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011531 }
11532
11533 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011534 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011535 PyUnicode_WRITE(okind, odata, o++, '\\');
11536 PyUnicode_WRITE(okind, odata, o++, 'x');
11537 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11538 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011539 }
11540
Georg Brandl559e5d72008-06-11 18:37:52 +000011541 /* Copy ASCII characters as-is */
11542 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011543 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011544 }
11545
Benjamin Peterson29060642009-01-31 22:14:21 +000011546 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011547 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011548 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011549 (categories Z* and C* except ASCII space)
11550 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011551 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011552 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011553 if (ch <= 0xff) {
11554 PyUnicode_WRITE(okind, odata, o++, '\\');
11555 PyUnicode_WRITE(okind, odata, o++, 'x');
11556 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11557 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011558 }
11559 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011560 else if (ch >= 0x10000) {
11561 PyUnicode_WRITE(okind, odata, o++, '\\');
11562 PyUnicode_WRITE(okind, odata, o++, 'U');
11563 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
11564 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
11565 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
11566 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
11567 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11568 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11569 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11570 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011571 }
11572 /* Map 16-bit characters to '\uxxxx' */
11573 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011574 PyUnicode_WRITE(okind, odata, o++, '\\');
11575 PyUnicode_WRITE(okind, odata, o++, 'u');
11576 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11577 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11578 PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11579 PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011580 }
11581 }
11582 /* Copy characters as-is */
11583 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011584 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011585 }
11586 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000011587 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011588 /* Closing quote already added at the beginning */
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011589 assert(_PyUnicode_CheckConsistency(unicode, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000011590 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011591}
11592
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011593PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011594 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011595\n\
11596Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011597such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011598arguments start and end are interpreted as in slice notation.\n\
11599\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011600Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011601
11602static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011603unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011604{
Jesus Ceaac451502011-04-20 17:09:23 +020011605 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011606 Py_ssize_t start;
11607 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011608 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011609
Jesus Ceaac451502011-04-20 17:09:23 +020011610 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11611 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011612 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011613
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011614 if (PyUnicode_READY(self) == -1)
11615 return NULL;
11616 if (PyUnicode_READY(substring) == -1)
11617 return NULL;
11618
11619 result = any_find_slice(
Victor Stinnerc3cec782011-10-05 21:24:08 +020011620 asciilib_rfind_slice, ucs1lib_rfind_slice,
11621 ucs2lib_rfind_slice, ucs4lib_rfind_slice,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011622 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011623 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011624
11625 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011626
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011627 if (result == -2)
11628 return NULL;
11629
Christian Heimes217cfd12007-12-02 14:31:20 +000011630 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011631}
11632
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011633PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011634 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011635\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011636Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011637
11638static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011639unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011640{
Jesus Ceaac451502011-04-20 17:09:23 +020011641 PyUnicodeObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011642 Py_ssize_t start;
11643 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011644 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011645
Jesus Ceaac451502011-04-20 17:09:23 +020011646 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
11647 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000011648 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011649
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011650 if (PyUnicode_READY(self) == -1)
11651 return NULL;
11652 if (PyUnicode_READY(substring) == -1)
11653 return NULL;
11654
11655 result = any_find_slice(
Victor Stinnerc3cec782011-10-05 21:24:08 +020011656 asciilib_rfind_slice, ucs1lib_rfind_slice,
11657 ucs2lib_rfind_slice, ucs4lib_rfind_slice,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011658 self, (PyObject*)substring, start, end
Thomas Wouters477c8d52006-05-27 19:21:47 +000011659 );
Guido van Rossumd57fd912000-03-10 22:53:23 +000011660
11661 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011662
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011663 if (result == -2)
11664 return NULL;
11665
Guido van Rossumd57fd912000-03-10 22:53:23 +000011666 if (result < 0) {
11667 PyErr_SetString(PyExc_ValueError, "substring not found");
11668 return NULL;
11669 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011670
Christian Heimes217cfd12007-12-02 14:31:20 +000011671 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011672}
11673
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011674PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011675 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011676\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011677Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011678done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011679
11680static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011681unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011682{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011683 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011684 Py_UCS4 fillchar = ' ';
11685
Victor Stinnere9a29352011-10-01 02:14:59 +020011686 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011687 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011688
Victor Stinnere9a29352011-10-01 02:14:59 +020011689 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011690 return NULL;
11691
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011692 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011693 Py_INCREF(self);
11694 return (PyObject*) self;
11695 }
11696
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011697 return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011698}
11699
Alexander Belopolsky40018472011-02-26 01:02:56 +000011700PyObject *
11701PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011702{
11703 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000011704
Guido van Rossumd57fd912000-03-10 22:53:23 +000011705 s = PyUnicode_FromObject(s);
11706 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011707 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011708 if (sep != NULL) {
11709 sep = PyUnicode_FromObject(sep);
11710 if (sep == NULL) {
11711 Py_DECREF(s);
11712 return NULL;
11713 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011714 }
11715
Victor Stinner9310abb2011-10-05 00:59:23 +020011716 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011717
11718 Py_DECREF(s);
11719 Py_XDECREF(sep);
11720 return result;
11721}
11722
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011723PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011724 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011725\n\
11726Return a list of the words in S, using sep as the\n\
11727delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000011728splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000011729whitespace string is a separator and empty strings are\n\
11730removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011731
11732static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011733unicode_split(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011734{
11735 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011736 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011737
Martin v. Löwis18e16552006-02-15 17:27:45 +000011738 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011739 return NULL;
11740
11741 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011742 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011743 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020011744 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011745 else
Benjamin Peterson29060642009-01-31 22:14:21 +000011746 return PyUnicode_Split((PyObject *)self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011747}
11748
Thomas Wouters477c8d52006-05-27 19:21:47 +000011749PyObject *
11750PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11751{
11752 PyObject* str_obj;
11753 PyObject* sep_obj;
11754 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011755 int kind1, kind2, kind;
11756 void *buf1 = NULL, *buf2 = NULL;
11757 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011758
11759 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020011760 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011761 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011762 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011763 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011764 Py_DECREF(str_obj);
11765 return NULL;
11766 }
11767
Victor Stinner14f8f022011-10-05 20:58:25 +020011768 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011769 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020011770 kind = Py_MAX(kind1, kind2);
11771 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011772 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020011773 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011774 if (!buf1)
11775 goto onError;
11776 buf2 = PyUnicode_DATA(sep_obj);
11777 if (kind2 != kind)
11778 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11779 if (!buf2)
11780 goto onError;
11781 len1 = PyUnicode_GET_LENGTH(str_obj);
11782 len2 = PyUnicode_GET_LENGTH(sep_obj);
11783
Victor Stinner14f8f022011-10-05 20:58:25 +020011784 switch(PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011785 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020011786 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
11787 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11788 else
11789 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011790 break;
11791 case PyUnicode_2BYTE_KIND:
11792 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11793 break;
11794 case PyUnicode_4BYTE_KIND:
11795 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11796 break;
11797 default:
11798 assert(0);
11799 out = 0;
11800 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011801
11802 Py_DECREF(sep_obj);
11803 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011804 if (kind1 != kind)
11805 PyMem_Free(buf1);
11806 if (kind2 != kind)
11807 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011808
11809 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011810 onError:
11811 Py_DECREF(sep_obj);
11812 Py_DECREF(str_obj);
11813 if (kind1 != kind && buf1)
11814 PyMem_Free(buf1);
11815 if (kind2 != kind && buf2)
11816 PyMem_Free(buf2);
11817 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011818}
11819
11820
11821PyObject *
11822PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11823{
11824 PyObject* str_obj;
11825 PyObject* sep_obj;
11826 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011827 int kind1, kind2, kind;
11828 void *buf1 = NULL, *buf2 = NULL;
11829 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011830
11831 str_obj = PyUnicode_FromObject(str_in);
11832 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000011833 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011834 sep_obj = PyUnicode_FromObject(sep_in);
11835 if (!sep_obj) {
11836 Py_DECREF(str_obj);
11837 return NULL;
11838 }
11839
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011840 kind1 = PyUnicode_KIND(str_in);
11841 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020011842 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011843 buf1 = PyUnicode_DATA(str_in);
11844 if (kind1 != kind)
11845 buf1 = _PyUnicode_AsKind(str_in, kind);
11846 if (!buf1)
11847 goto onError;
11848 buf2 = PyUnicode_DATA(sep_obj);
11849 if (kind2 != kind)
11850 buf2 = _PyUnicode_AsKind(sep_obj, kind);
11851 if (!buf2)
11852 goto onError;
11853 len1 = PyUnicode_GET_LENGTH(str_obj);
11854 len2 = PyUnicode_GET_LENGTH(sep_obj);
11855
11856 switch(PyUnicode_KIND(str_in)) {
11857 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020011858 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
11859 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11860 else
11861 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011862 break;
11863 case PyUnicode_2BYTE_KIND:
11864 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11865 break;
11866 case PyUnicode_4BYTE_KIND:
11867 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11868 break;
11869 default:
11870 assert(0);
11871 out = 0;
11872 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011873
11874 Py_DECREF(sep_obj);
11875 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011876 if (kind1 != kind)
11877 PyMem_Free(buf1);
11878 if (kind2 != kind)
11879 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011880
11881 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011882 onError:
11883 Py_DECREF(sep_obj);
11884 Py_DECREF(str_obj);
11885 if (kind1 != kind && buf1)
11886 PyMem_Free(buf1);
11887 if (kind2 != kind && buf2)
11888 PyMem_Free(buf2);
11889 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011890}
11891
11892PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011893 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011894\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011895Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011896the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011897found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011898
11899static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011900unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011901{
Victor Stinner9310abb2011-10-05 00:59:23 +020011902 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011903}
11904
11905PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000011906 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011907\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000011908Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011909the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011910separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000011911
11912static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011913unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011914{
Victor Stinner9310abb2011-10-05 00:59:23 +020011915 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011916}
11917
Alexander Belopolsky40018472011-02-26 01:02:56 +000011918PyObject *
11919PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011920{
11921 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011922
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011923 s = PyUnicode_FromObject(s);
11924 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000011925 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000011926 if (sep != NULL) {
11927 sep = PyUnicode_FromObject(sep);
11928 if (sep == NULL) {
11929 Py_DECREF(s);
11930 return NULL;
11931 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011932 }
11933
Victor Stinner9310abb2011-10-05 00:59:23 +020011934 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011935
11936 Py_DECREF(s);
11937 Py_XDECREF(sep);
11938 return result;
11939}
11940
11941PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011942 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011943\n\
11944Return a list of the words in S, using sep as the\n\
11945delimiter string, starting at the end of the string and\n\
11946working to the front. If maxsplit is given, at most maxsplit\n\
11947splits are done. If sep is not specified, any whitespace string\n\
11948is a separator.");
11949
11950static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011951unicode_rsplit(PyObject *self, PyObject *args)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011952{
11953 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011954 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011955
Martin v. Löwis18e16552006-02-15 17:27:45 +000011956 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011957 return NULL;
11958
11959 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000011960 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011961 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020011962 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011963 else
Victor Stinner9310abb2011-10-05 00:59:23 +020011964 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000011965}
11966
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011967PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011968 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011969\n\
11970Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000011971Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011972is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011973
11974static PyObject*
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011975unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011976{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011977 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000011978 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011979
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010011980 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11981 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011982 return NULL;
11983
Guido van Rossum86662912000-04-11 15:38:46 +000011984 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011985}
11986
11987static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000011988PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011989{
Walter Dörwald346737f2007-05-31 10:44:43 +000011990 if (PyUnicode_CheckExact(self)) {
11991 Py_INCREF(self);
11992 return self;
11993 } else
11994 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020011995 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011996}
11997
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011998PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011999 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012000\n\
12001Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012002and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012003
12004static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012005unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012006{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012007 return fixup(self, fixswapcase);
12008}
12009
Georg Brandlceee0772007-11-27 23:48:05 +000012010PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012011 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012012\n\
12013Return a translation table usable for str.translate().\n\
12014If there is only one argument, it must be a dictionary mapping Unicode\n\
12015ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012016Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012017If there are two arguments, they must be strings of equal length, and\n\
12018in the resulting dictionary, each character in x will be mapped to the\n\
12019character at the same position in y. If there is a third argument, it\n\
12020must be a string, whose characters will be mapped to None in the result.");
12021
12022static PyObject*
12023unicode_maketrans(PyUnicodeObject *null, PyObject *args)
12024{
12025 PyObject *x, *y = NULL, *z = NULL;
12026 PyObject *new = NULL, *key, *value;
12027 Py_ssize_t i = 0;
12028 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012029
Georg Brandlceee0772007-11-27 23:48:05 +000012030 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12031 return NULL;
12032 new = PyDict_New();
12033 if (!new)
12034 return NULL;
12035 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012036 int x_kind, y_kind, z_kind;
12037 void *x_data, *y_data, *z_data;
12038
Georg Brandlceee0772007-11-27 23:48:05 +000012039 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012040 if (!PyUnicode_Check(x)) {
12041 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12042 "be a string if there is a second argument");
12043 goto err;
12044 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012045 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012046 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12047 "arguments must have equal length");
12048 goto err;
12049 }
12050 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012051 x_kind = PyUnicode_KIND(x);
12052 y_kind = PyUnicode_KIND(y);
12053 x_data = PyUnicode_DATA(x);
12054 y_data = PyUnicode_DATA(y);
12055 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12056 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12057 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012058 if (!key || !value)
12059 goto err;
12060 res = PyDict_SetItem(new, key, value);
12061 Py_DECREF(key);
12062 Py_DECREF(value);
12063 if (res < 0)
12064 goto err;
12065 }
12066 /* create entries for deleting chars in z */
12067 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012068 z_kind = PyUnicode_KIND(z);
12069 z_data = PyUnicode_DATA(z);
Georg Brandlceee0772007-11-27 23:48:05 +000012070 for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012071 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012072 if (!key)
12073 goto err;
12074 res = PyDict_SetItem(new, key, Py_None);
12075 Py_DECREF(key);
12076 if (res < 0)
12077 goto err;
12078 }
12079 }
12080 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012081 int kind;
12082 void *data;
12083
Georg Brandlceee0772007-11-27 23:48:05 +000012084 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012085 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012086 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12087 "to maketrans it must be a dict");
12088 goto err;
12089 }
12090 /* copy entries into the new dict, converting string keys to int keys */
12091 while (PyDict_Next(x, &i, &key, &value)) {
12092 if (PyUnicode_Check(key)) {
12093 /* convert string keys to integer keys */
12094 PyObject *newkey;
12095 if (PyUnicode_GET_SIZE(key) != 1) {
12096 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12097 "table must be of length 1");
12098 goto err;
12099 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012100 kind = PyUnicode_KIND(key);
12101 data = PyUnicode_DATA(key);
12102 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012103 if (!newkey)
12104 goto err;
12105 res = PyDict_SetItem(new, newkey, value);
12106 Py_DECREF(newkey);
12107 if (res < 0)
12108 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012109 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012110 /* just keep integer keys */
12111 if (PyDict_SetItem(new, key, value) < 0)
12112 goto err;
12113 } else {
12114 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12115 "be strings or integers");
12116 goto err;
12117 }
12118 }
12119 }
12120 return new;
12121 err:
12122 Py_DECREF(new);
12123 return NULL;
12124}
12125
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012126PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012127 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012128\n\
12129Return a copy of the string S, where all characters have been mapped\n\
12130through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012131Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012132Unmapped characters are left untouched. Characters mapped to None\n\
12133are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012134
12135static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012136unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012137{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012138 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012139}
12140
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012141PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012142 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012143\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012144Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012145
12146static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012147unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012148{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012149 return fixup(self, fixupper);
12150}
12151
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012152PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012153 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012154\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012155Pad a numeric string S with zeros on the left, to fill a field\n\
12156of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012157
12158static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012159unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012160{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012161 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012162 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012163 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012164 int kind;
12165 void *data;
12166 Py_UCS4 chr;
12167
12168 if (PyUnicode_READY(self) == -1)
12169 return NULL;
12170
Martin v. Löwis18e16552006-02-15 17:27:45 +000012171 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012172 return NULL;
12173
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012174 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012175 if (PyUnicode_CheckExact(self)) {
12176 Py_INCREF(self);
12177 return (PyObject*) self;
12178 }
12179 else
Victor Stinner2219e0a2011-10-01 01:16:59 +020012180 return PyUnicode_Copy((PyObject*)self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012181 }
12182
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012183 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012184
12185 u = pad(self, fill, 0, '0');
12186
Walter Dörwald068325e2002-04-15 13:36:47 +000012187 if (u == NULL)
12188 return NULL;
12189
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012190 kind = PyUnicode_KIND(u);
12191 data = PyUnicode_DATA(u);
12192 chr = PyUnicode_READ(kind, data, fill);
12193
12194 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012195 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012196 PyUnicode_WRITE(kind, data, 0, chr);
12197 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012198 }
12199
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012200 assert(_PyUnicode_CheckConsistency(u, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012201 return (PyObject*) u;
12202}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012203
#if 0
/* Compiled out.  Thin wrapper that returns the result of
   PyUnicode_TransformDecimalAndSpaceToASCII() applied to self. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *ascii;

    ascii = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return ascii;
}
#endif
12211
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012212PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012213 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012214\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012215Return True if S starts with the specified prefix, False otherwise.\n\
12216With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012217With optional end, stop comparing S at that position.\n\
12218prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012219
12220static PyObject *
12221unicode_startswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012222 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012223{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012224 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012225 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012226 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012227 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012228 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012229
Jesus Ceaac451502011-04-20 17:09:23 +020012230 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012231 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012232 if (PyTuple_Check(subobj)) {
12233 Py_ssize_t i;
12234 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12235 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012236 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012237 if (substring == NULL)
12238 return NULL;
12239 result = tailmatch(self, substring, start, end, -1);
12240 Py_DECREF(substring);
12241 if (result) {
12242 Py_RETURN_TRUE;
12243 }
12244 }
12245 /* nothing matched */
12246 Py_RETURN_FALSE;
12247 }
12248 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012249 if (substring == NULL) {
12250 if (PyErr_ExceptionMatches(PyExc_TypeError))
12251 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12252 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012253 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012254 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012255 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012256 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012257 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012258}
12259
12260
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012261PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012262 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012263\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012264Return True if S ends with the specified suffix, False otherwise.\n\
12265With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012266With optional end, stop comparing S at that position.\n\
12267suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012268
12269static PyObject *
12270unicode_endswith(PyUnicodeObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012271 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012272{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012273 PyObject *subobj;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012274 PyUnicodeObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012275 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012276 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012277 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012278
Jesus Ceaac451502011-04-20 17:09:23 +020012279 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012280 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012281 if (PyTuple_Check(subobj)) {
12282 Py_ssize_t i;
12283 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12284 substring = (PyUnicodeObject *)PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012285 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012286 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012287 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012288 result = tailmatch(self, substring, start, end, +1);
12289 Py_DECREF(substring);
12290 if (result) {
12291 Py_RETURN_TRUE;
12292 }
12293 }
12294 Py_RETURN_FALSE;
12295 }
12296 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012297 if (substring == NULL) {
12298 if (PyErr_ExceptionMatches(PyExc_TypeError))
12299 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12300 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012301 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012302 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012303 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012304 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012305 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012306}
12307
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012308#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012309
12310PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012311 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012312\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012313Return a formatted version of S, using substitutions from args and kwargs.\n\
12314The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012315
Eric Smith27bbca62010-11-04 17:06:58 +000012316PyDoc_STRVAR(format_map__doc__,
12317 "S.format_map(mapping) -> str\n\
12318\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012319Return a formatted version of S, using substitutions from mapping.\n\
12320The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012321
Eric Smith4a7d76d2008-05-30 18:10:19 +000012322static PyObject *
12323unicode__format__(PyObject* self, PyObject* args)
12324{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012325 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012326
12327 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12328 return NULL;
12329
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012330 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012331 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012332 if (out != NULL)
12333 assert(_PyUnicode_CheckConsistency(out, 1));
12334 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012335}
12336
Eric Smith8c663262007-08-25 02:26:07 +000012337PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012338 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012339\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012340Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012341
12342static PyObject *
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012343unicode__sizeof__(PyUnicodeObject *v)
12344{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012345 Py_ssize_t size;
12346
12347 /* If it's a compact object, account for base structure +
12348 character data. */
12349 if (PyUnicode_IS_COMPACT_ASCII(v))
12350 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12351 else if (PyUnicode_IS_COMPACT(v))
12352 size = sizeof(PyCompactUnicodeObject) +
12353 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
12354 else {
12355 /* If it is a two-block object, account for base object, and
12356 for character block if present. */
12357 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012358 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012359 size += (PyUnicode_GET_LENGTH(v) + 1) *
12360 PyUnicode_CHARACTER_SIZE(v);
12361 }
12362 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012363 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012364 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012365 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012366 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012367 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012368
12369 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012370}
12371
12372PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012373 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012374
12375static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012376unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012377{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012378 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012379 if (!copy)
12380 return NULL;
12381 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012382}
12383
Guido van Rossumd57fd912000-03-10 22:53:23 +000012384static PyMethodDef unicode_methods[] = {
12385
12386 /* Order is according to common usage: often used methods should
12387 appear first, since lookup is done sequentially. */
12388
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012389 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012390 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12391 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012392 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012393 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12394 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12395 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12396 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12397 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12398 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12399 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012400 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012401 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12402 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12403 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012404 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012405 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12406 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12407 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012408 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012409 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012410 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012411 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012412 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12413 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12414 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12415 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12416 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12417 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12418 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12419 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12420 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12421 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12422 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12423 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12424 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12425 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012426 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012427 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012428 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012429 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012430 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012431 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012432 {"maketrans", (PyCFunction) unicode_maketrans,
12433 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012434 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012435#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012436 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012437#endif
12438
12439#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012440 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012441 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012442#endif
12443
Benjamin Peterson14339b62009-01-31 16:36:08 +000012444 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012445 {NULL, NULL}
12446};
12447
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012448static PyObject *
12449unicode_mod(PyObject *v, PyObject *w)
12450{
Brian Curtindfc80e32011-08-10 20:28:54 -050012451 if (!PyUnicode_Check(v))
12452 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012453 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012454}
12455
12456static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012457 0, /*nb_add*/
12458 0, /*nb_subtract*/
12459 0, /*nb_multiply*/
12460 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012461};
12462
Guido van Rossumd57fd912000-03-10 22:53:23 +000012463static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012464 (lenfunc) unicode_length, /* sq_length */
12465 PyUnicode_Concat, /* sq_concat */
12466 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12467 (ssizeargfunc) unicode_getitem, /* sq_item */
12468 0, /* sq_slice */
12469 0, /* sq_ass_item */
12470 0, /* sq_ass_slice */
12471 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012472};
12473
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012474static PyObject*
12475unicode_subscript(PyUnicodeObject* self, PyObject* item)
12476{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012477 if (PyUnicode_READY(self) == -1)
12478 return NULL;
12479
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000012480 if (PyIndex_Check(item)) {
12481 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012482 if (i == -1 && PyErr_Occurred())
12483 return NULL;
12484 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012485 i += PyUnicode_GET_LENGTH(self);
Victor Stinner2fe5ced2011-10-02 00:25:40 +020012486 return unicode_getitem((PyObject*)self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012487 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000012488 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012489 PyObject *result;
12490 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012491 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012492 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012493
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012494 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000012495 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012496 return NULL;
12497 }
12498
12499 if (slicelength <= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012500 return PyUnicode_New(0, 0);
12501 } else if (start == 0 && step == 1 &&
12502 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000012503 PyUnicode_CheckExact(self)) {
12504 Py_INCREF(self);
12505 return (PyObject *)self;
12506 } else if (step == 1) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012507 return PyUnicode_Substring((PyObject*)self,
12508 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012509 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012510 /* General case */
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012511 max_char = 0;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012512 src_kind = PyUnicode_KIND(self);
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012513 kind_limit = kind_maxchar_limit(src_kind);
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012514 src_data = PyUnicode_DATA(self);
12515 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12516 ch = PyUnicode_READ(src_kind, src_data, cur);
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012517 if (ch > max_char) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012518 max_char = ch;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012519 if (max_char >= kind_limit)
12520 break;
12521 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012522 }
12523 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012524 if (result == NULL)
12525 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012526 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012527 dest_data = PyUnicode_DATA(result);
12528
12529 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012530 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
12531 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012532 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012533 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012534 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012535 } else {
12536 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12537 return NULL;
12538 }
12539}
12540
12541static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012542 (lenfunc)unicode_length, /* mp_length */
12543 (binaryfunc)unicode_subscript, /* mp_subscript */
12544 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012545};
12546
Guido van Rossumd57fd912000-03-10 22:53:23 +000012547
Guido van Rossumd57fd912000-03-10 22:53:23 +000012548/* Helpers for PyUnicode_Format() */
12549
12550static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000012551getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012552{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012553 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012554 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012555 (*p_argidx)++;
12556 if (arglen < 0)
12557 return args;
12558 else
12559 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012560 }
12561 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012562 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012563 return NULL;
12564}
12565
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012566/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012567
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012568static PyObject *
12569formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012570{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012571 char *p;
12572 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012573 double x;
Tim Petersced69f82003-09-16 20:30:58 +000012574
Guido van Rossumd57fd912000-03-10 22:53:23 +000012575 x = PyFloat_AsDouble(v);
12576 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012577 return NULL;
12578
Guido van Rossumd57fd912000-03-10 22:53:23 +000012579 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012580 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000012581
Eric Smith0923d1d2009-04-16 20:16:10 +000012582 p = PyOS_double_to_string(x, type, prec,
12583 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012584 if (p == NULL)
12585 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012586 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000012587 PyMem_Free(p);
12588 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012589}
12590
Tim Peters38fd5b62000-09-21 05:43:11 +000012591static PyObject*
12592formatlong(PyObject *val, int flags, int prec, int type)
12593{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012594 char *buf;
12595 int len;
12596 PyObject *str; /* temporary string object. */
12597 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012598
Benjamin Peterson14339b62009-01-31 16:36:08 +000012599 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
12600 if (!str)
12601 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012602 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012603 Py_DECREF(str);
12604 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000012605}
12606
Guido van Rossumd57fd912000-03-10 22:53:23 +000012607static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012608formatchar(Py_UCS4 *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012609 size_t buflen,
12610 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012611{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012612 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012613 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012614 if (PyUnicode_GET_LENGTH(v) == 1) {
12615 buf[0] = PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000012616 buf[1] = '\0';
12617 return 1;
12618 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012619 goto onError;
12620 }
12621 else {
12622 /* Integer input truncated to a character */
12623 long x;
12624 x = PyLong_AsLong(v);
12625 if (x == -1 && PyErr_Occurred())
12626 goto onError;
12627
12628 if (x < 0 || x > 0x10ffff) {
12629 PyErr_SetString(PyExc_OverflowError,
12630 "%c arg not in range(0x110000)");
12631 return -1;
12632 }
12633
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012634 buf[0] = (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012635 buf[1] = '\0';
12636 return 1;
12637 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000012638
Benjamin Peterson29060642009-01-31 22:14:21 +000012639 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012640 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012641 "%c requires int or char");
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000012642 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012643}
12644
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
   FORMATBUFLEN is the length of the buffer in which chars are formatted.

   The expansion is fully parenthesized so the macro behaves as a single
   operand wherever it is used (for example after `sizeof`).
*/
#define FORMATBUFLEN ((size_t)10)
Marc-André Lemburgf28dd832000-06-30 10:29:57 +000012649
Alexander Belopolsky40018472011-02-26 01:02:56 +000012650PyObject *
12651PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012652{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012653 void *fmt;
12654 int fmtkind;
12655 PyObject *result;
12656 Py_UCS4 *res, *res0;
12657 Py_UCS4 max;
12658 int kind;
12659 Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012660 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012661 PyObject *dict = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012662 PyUnicodeObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +000012663
Guido van Rossumd57fd912000-03-10 22:53:23 +000012664 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012665 PyErr_BadInternalCall();
12666 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012667 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012668 uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
12669 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012670 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012671 fmt = PyUnicode_DATA(uformat);
12672 fmtkind = PyUnicode_KIND(uformat);
12673 fmtcnt = PyUnicode_GET_LENGTH(uformat);
12674 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012675
12676 reslen = rescnt = fmtcnt + 100;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012677 res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
12678 if (res0 == NULL) {
12679 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012680 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012681 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012682
12683 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012684 arglen = PyTuple_Size(args);
12685 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012686 }
12687 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012688 arglen = -1;
12689 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012690 }
Christian Heimes90aa7642007-12-19 02:45:37 +000012691 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000012692 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000012693 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012694
12695 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012696 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012697 if (--rescnt < 0) {
12698 rescnt = fmtcnt + 100;
12699 reslen += rescnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012700 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12701 if (res0 == NULL){
12702 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000012703 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012704 }
12705 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000012706 --rescnt;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012707 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012708 *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012709 }
12710 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012711 /* Got a format specifier */
12712 int flags = 0;
12713 Py_ssize_t width = -1;
12714 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012715 Py_UCS4 c = '\0';
12716 Py_UCS4 fill;
Benjamin Peterson29060642009-01-31 22:14:21 +000012717 int isnumok;
12718 PyObject *v = NULL;
12719 PyObject *temp = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012720 void *pbuf;
12721 Py_ssize_t pindex;
Benjamin Peterson29060642009-01-31 22:14:21 +000012722 Py_UNICODE sign;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012723 Py_ssize_t len, len1;
12724 Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012725
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012726 fmtpos++;
12727 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
12728 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000012729 Py_ssize_t keylen;
12730 PyObject *key;
12731 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000012732
Benjamin Peterson29060642009-01-31 22:14:21 +000012733 if (dict == NULL) {
12734 PyErr_SetString(PyExc_TypeError,
12735 "format requires a mapping");
12736 goto onError;
12737 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012738 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012739 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012740 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000012741 /* Skip over balanced parentheses */
12742 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012743 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000012744 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012745 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000012746 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012747 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000012748 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012749 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012750 if (fmtcnt < 0 || pcount > 0) {
12751 PyErr_SetString(PyExc_ValueError,
12752 "incomplete format key");
12753 goto onError;
12754 }
Victor Stinner12bab6d2011-10-01 01:53:49 +020012755 key = PyUnicode_Substring((PyObject*)uformat,
12756 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000012757 if (key == NULL)
12758 goto onError;
12759 if (args_owned) {
12760 Py_DECREF(args);
12761 args_owned = 0;
12762 }
12763 args = PyObject_GetItem(dict, key);
12764 Py_DECREF(key);
12765 if (args == NULL) {
12766 goto onError;
12767 }
12768 args_owned = 1;
12769 arglen = -1;
12770 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012771 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012772 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012773 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012774 case '-': flags |= F_LJUST; continue;
12775 case '+': flags |= F_SIGN; continue;
12776 case ' ': flags |= F_BLANK; continue;
12777 case '#': flags |= F_ALT; continue;
12778 case '0': flags |= F_ZERO; continue;
12779 }
12780 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012781 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012782 if (c == '*') {
12783 v = getnextarg(args, arglen, &argidx);
12784 if (v == NULL)
12785 goto onError;
12786 if (!PyLong_Check(v)) {
12787 PyErr_SetString(PyExc_TypeError,
12788 "* wants int");
12789 goto onError;
12790 }
12791 width = PyLong_AsLong(v);
12792 if (width == -1 && PyErr_Occurred())
12793 goto onError;
12794 if (width < 0) {
12795 flags |= F_LJUST;
12796 width = -width;
12797 }
12798 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012799 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012800 }
12801 else if (c >= '0' && c <= '9') {
12802 width = c - '0';
12803 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012804 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012805 if (c < '0' || c > '9')
12806 break;
12807 if ((width*10) / 10 != width) {
12808 PyErr_SetString(PyExc_ValueError,
12809 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000012810 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000012811 }
12812 width = width*10 + (c - '0');
12813 }
12814 }
12815 if (c == '.') {
12816 prec = 0;
12817 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012818 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012819 if (c == '*') {
12820 v = getnextarg(args, arglen, &argidx);
12821 if (v == NULL)
12822 goto onError;
12823 if (!PyLong_Check(v)) {
12824 PyErr_SetString(PyExc_TypeError,
12825 "* wants int");
12826 goto onError;
12827 }
12828 prec = PyLong_AsLong(v);
12829 if (prec == -1 && PyErr_Occurred())
12830 goto onError;
12831 if (prec < 0)
12832 prec = 0;
12833 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012834 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012835 }
12836 else if (c >= '0' && c <= '9') {
12837 prec = c - '0';
12838 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012839 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012840 if (c < '0' || c > '9')
12841 break;
12842 if ((prec*10) / 10 != prec) {
12843 PyErr_SetString(PyExc_ValueError,
12844 "prec too big");
12845 goto onError;
12846 }
12847 prec = prec*10 + (c - '0');
12848 }
12849 }
12850 } /* prec */
12851 if (fmtcnt >= 0) {
12852 if (c == 'h' || c == 'l' || c == 'L') {
12853 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012854 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000012855 }
12856 }
12857 if (fmtcnt < 0) {
12858 PyErr_SetString(PyExc_ValueError,
12859 "incomplete format");
12860 goto onError;
12861 }
12862 if (c != '%') {
12863 v = getnextarg(args, arglen, &argidx);
12864 if (v == NULL)
12865 goto onError;
12866 }
12867 sign = 0;
12868 fill = ' ';
12869 switch (c) {
12870
12871 case '%':
12872 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012873 kind = PyUnicode_4BYTE_KIND;
Benjamin Peterson29060642009-01-31 22:14:21 +000012874 /* presume that buffer length is at least 1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012875 PyUnicode_WRITE(kind, pbuf, 0, '%');
Benjamin Peterson29060642009-01-31 22:14:21 +000012876 len = 1;
12877 break;
12878
12879 case 's':
12880 case 'r':
12881 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000012882 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012883 temp = v;
12884 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000012885 }
12886 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012887 if (c == 's')
12888 temp = PyObject_Str(v);
12889 else if (c == 'r')
12890 temp = PyObject_Repr(v);
12891 else
12892 temp = PyObject_ASCII(v);
12893 if (temp == NULL)
12894 goto onError;
12895 if (PyUnicode_Check(temp))
12896 /* nothing to do */;
12897 else {
12898 Py_DECREF(temp);
12899 PyErr_SetString(PyExc_TypeError,
12900 "%s argument has non-string str()");
12901 goto onError;
12902 }
12903 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012904 if (PyUnicode_READY(temp) == -1) {
12905 Py_CLEAR(temp);
12906 goto onError;
12907 }
12908 pbuf = PyUnicode_DATA(temp);
12909 kind = PyUnicode_KIND(temp);
12910 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012911 if (prec >= 0 && len > prec)
12912 len = prec;
12913 break;
12914
12915 case 'i':
12916 case 'd':
12917 case 'u':
12918 case 'o':
12919 case 'x':
12920 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000012921 isnumok = 0;
12922 if (PyNumber_Check(v)) {
12923 PyObject *iobj=NULL;
12924
12925 if (PyLong_Check(v)) {
12926 iobj = v;
12927 Py_INCREF(iobj);
12928 }
12929 else {
12930 iobj = PyNumber_Long(v);
12931 }
12932 if (iobj!=NULL) {
12933 if (PyLong_Check(iobj)) {
12934 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070012935 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000012936 Py_DECREF(iobj);
12937 if (!temp)
12938 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012939 if (PyUnicode_READY(temp) == -1) {
12940 Py_CLEAR(temp);
12941 goto onError;
12942 }
12943 pbuf = PyUnicode_DATA(temp);
12944 kind = PyUnicode_KIND(temp);
12945 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012946 sign = 1;
12947 }
12948 else {
12949 Py_DECREF(iobj);
12950 }
12951 }
12952 }
12953 if (!isnumok) {
12954 PyErr_Format(PyExc_TypeError,
12955 "%%%c format: a number is required, "
12956 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12957 goto onError;
12958 }
12959 if (flags & F_ZERO)
12960 fill = '0';
12961 break;
12962
12963 case 'e':
12964 case 'E':
12965 case 'f':
12966 case 'F':
12967 case 'g':
12968 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012969 temp = formatfloat(v, flags, prec, c);
12970 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000012971 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012972 if (PyUnicode_READY(temp) == -1) {
12973 Py_CLEAR(temp);
12974 goto onError;
12975 }
12976 pbuf = PyUnicode_DATA(temp);
12977 kind = PyUnicode_KIND(temp);
12978 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000012979 sign = 1;
12980 if (flags & F_ZERO)
12981 fill = '0';
12982 break;
12983
12984 case 'c':
12985 pbuf = formatbuf;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012986 kind = PyUnicode_4BYTE_KIND;
Victor Stinnerb9dcffb2011-09-29 00:39:24 +020012987 len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v);
Benjamin Peterson29060642009-01-31 22:14:21 +000012988 if (len < 0)
12989 goto onError;
12990 break;
12991
12992 default:
12993 PyErr_Format(PyExc_ValueError,
12994 "unsupported format character '%c' (0x%x) "
12995 "at index %zd",
12996 (31<=c && c<=126) ? (char)c : '?',
12997 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012998 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000012999 goto onError;
13000 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013001 /* pbuf is initialized here. */
13002 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013003 if (sign) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013004 if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
13005 PyUnicode_READ(kind, pbuf, pindex) == '+') {
13006 sign = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013007 len--;
13008 }
13009 else if (flags & F_SIGN)
13010 sign = '+';
13011 else if (flags & F_BLANK)
13012 sign = ' ';
13013 else
13014 sign = 0;
13015 }
13016 if (width < len)
13017 width = len;
13018 if (rescnt - (sign != 0) < width) {
13019 reslen -= rescnt;
13020 rescnt = width + fmtcnt + 100;
13021 reslen += rescnt;
13022 if (reslen < 0) {
13023 Py_XDECREF(temp);
13024 PyErr_NoMemory();
13025 goto onError;
13026 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013027 res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
13028 if (res0 == 0) {
13029 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +000013030 Py_XDECREF(temp);
13031 goto onError;
13032 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013033 res = res0 + reslen - rescnt;
Benjamin Peterson29060642009-01-31 22:14:21 +000013034 }
13035 if (sign) {
13036 if (fill != ' ')
13037 *res++ = sign;
13038 rescnt--;
13039 if (width > len)
13040 width--;
13041 }
13042 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013043 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13044 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013045 if (fill != ' ') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013046 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
13047 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013048 }
13049 rescnt -= 2;
13050 width -= 2;
13051 if (width < 0)
13052 width = 0;
13053 len -= 2;
13054 }
13055 if (width > len && !(flags & F_LJUST)) {
13056 do {
13057 --rescnt;
13058 *res++ = fill;
13059 } while (--width > len);
13060 }
13061 if (fill == ' ') {
13062 if (sign)
13063 *res++ = sign;
13064 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013065 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13066 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
13067 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
13068 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013069 }
13070 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013071 /* Copy all characters, preserving len */
13072 len1 = len;
13073 while (len1--) {
13074 *res++ = PyUnicode_READ(kind, pbuf, pindex++);
13075 rescnt--;
13076 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013077 while (--width >= len) {
13078 --rescnt;
13079 *res++ = ' ';
13080 }
13081 if (dict && (argidx < arglen) && c != '%') {
13082 PyErr_SetString(PyExc_TypeError,
13083 "not all arguments converted during string formatting");
Thomas Woutersa96affe2006-03-12 00:29:36 +000013084 Py_XDECREF(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013085 goto onError;
13086 }
13087 Py_XDECREF(temp);
13088 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013089 } /* until end */
13090 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013091 PyErr_SetString(PyExc_TypeError,
13092 "not all arguments converted during string formatting");
13093 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013094 }
13095
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013096
13097 for (max=0, res = res0; res < res0+reslen-rescnt; res++)
13098 if (*res > max)
13099 max = *res;
13100 result = PyUnicode_New(reslen - rescnt, max);
13101 if (!result)
Benjamin Peterson29060642009-01-31 22:14:21 +000013102 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013103 kind = PyUnicode_KIND(result);
13104 for (res = res0; res < res0+reslen-rescnt; res++)
13105 PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
13106 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013107 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013108 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013109 }
13110 Py_DECREF(uformat);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013111 assert(_PyUnicode_CheckConsistency(result, 1));
Guido van Rossumd57fd912000-03-10 22:53:23 +000013112 return (PyObject *)result;
13113
Benjamin Peterson29060642009-01-31 22:14:21 +000013114 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013115 PyMem_Free(res0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013116 Py_DECREF(uformat);
13117 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013118 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013119 }
13120 return NULL;
13121}
13122
Jeremy Hylton938ace62002-07-17 16:30:39 +000013123static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013124unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13125
Tim Peters6d6c1a32001-08-02 04:15:00 +000013126static PyObject *
13127unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13128{
Benjamin Peterson29060642009-01-31 22:14:21 +000013129 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013130 static char *kwlist[] = {"object", "encoding", "errors", 0};
13131 char *encoding = NULL;
13132 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013133
Benjamin Peterson14339b62009-01-31 16:36:08 +000013134 if (type != &PyUnicode_Type)
13135 return unicode_subtype_new(type, args, kwds);
13136 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013137 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013138 return NULL;
13139 if (x == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013140 return (PyObject *)PyUnicode_New(0, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013141 if (encoding == NULL && errors == NULL)
13142 return PyObject_Str(x);
13143 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013144 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013145}
13146
Guido van Rossume023fe02001-08-30 03:12:59 +000013147static PyObject *
13148unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13149{
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013150 PyUnicodeObject *unicode, *self;
13151 Py_ssize_t length, char_size;
13152 int share_wstr, share_utf8;
13153 unsigned int kind;
13154 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013155
Benjamin Peterson14339b62009-01-31 16:36:08 +000013156 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013157
13158 unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
13159 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013160 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013161 assert(_PyUnicode_CHECK(unicode));
Victor Stinnere06e1452011-10-04 20:52:31 +020013162 if (PyUnicode_READY(unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013163 return NULL;
13164
13165 self = (PyUnicodeObject *) type->tp_alloc(type, 0);
13166 if (self == NULL) {
13167 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013168 return NULL;
13169 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013170 kind = PyUnicode_KIND(unicode);
13171 length = PyUnicode_GET_LENGTH(unicode);
13172
13173 _PyUnicode_LENGTH(self) = length;
13174 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13175 _PyUnicode_STATE(self).interned = 0;
13176 _PyUnicode_STATE(self).kind = kind;
13177 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013178 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013179 _PyUnicode_STATE(self).ready = 1;
13180 _PyUnicode_WSTR(self) = NULL;
13181 _PyUnicode_UTF8_LENGTH(self) = 0;
13182 _PyUnicode_UTF8(self) = NULL;
13183 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013184 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013185
13186 share_utf8 = 0;
13187 share_wstr = 0;
13188 if (kind == PyUnicode_1BYTE_KIND) {
13189 char_size = 1;
13190 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13191 share_utf8 = 1;
13192 }
13193 else if (kind == PyUnicode_2BYTE_KIND) {
13194 char_size = 2;
13195 if (sizeof(wchar_t) == 2)
13196 share_wstr = 1;
13197 }
13198 else {
13199 assert(kind == PyUnicode_4BYTE_KIND);
13200 char_size = 4;
13201 if (sizeof(wchar_t) == 4)
13202 share_wstr = 1;
13203 }
13204
13205 /* Ensure we won't overflow the length. */
13206 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13207 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013208 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013209 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013210 data = PyObject_MALLOC((length + 1) * char_size);
13211 if (data == NULL) {
13212 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013213 goto onError;
13214 }
13215
Victor Stinnerc3c74152011-10-02 20:39:55 +020013216 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013217 if (share_utf8) {
13218 _PyUnicode_UTF8_LENGTH(self) = length;
13219 _PyUnicode_UTF8(self) = data;
13220 }
13221 if (share_wstr) {
13222 _PyUnicode_WSTR_LENGTH(self) = length;
13223 _PyUnicode_WSTR(self) = (wchar_t *)data;
13224 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013225
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013226 Py_MEMCPY(data, PyUnicode_DATA(unicode),
13227 PyUnicode_KIND_SIZE(kind, length + 1));
13228 Py_DECREF(unicode);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013229 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013230 return (PyObject *)self;
13231
13232onError:
13233 Py_DECREF(unicode);
13234 Py_DECREF(self);
13235 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013236}
13237
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013238PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000013239 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000013240\n\
Collin Winterd474ce82007-08-07 19:42:11 +000013241Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000013242encoding defaults to the current default string encoding.\n\
13243errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013244
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013245static PyObject *unicode_iter(PyObject *seq);
13246
/* Type object for "str".  Slots are filled positionally, so the order of
   the entries below must match the field order of PyTypeObject. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",                          /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_basicsize */
    0,                              /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                              /* tp_print */
    0,                              /* tp_getattr */
    0,                              /* tp_setattr */
    0,                              /* tp_reserved */
    unicode_repr,                   /* tp_repr */
    &unicode_as_number,             /* tp_as_number */
    &unicode_as_sequence,           /* tp_as_sequence */
    &unicode_as_mapping,            /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash */
    0,                              /* tp_call */
    (reprfunc) unicode_str,         /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                              /* tp_setattro */
    0,                              /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
    unicode_doc,                    /* tp_doc */
    0,                              /* tp_traverse */
    0,                              /* tp_clear */
    PyUnicode_RichCompare,          /* tp_richcompare */
    0,                              /* tp_weaklistoffset */
    unicode_iter,                   /* tp_iter */
    0,                              /* tp_iternext */
    unicode_methods,                /* tp_methods */
    0,                              /* tp_members */
    0,                              /* tp_getset */
    &PyBaseObject_Type,             /* tp_base */
    0,                              /* tp_dict */
    0,                              /* tp_descr_get */
    0,                              /* tp_descr_set */
    0,                              /* tp_dictoffset */
    0,                              /* tp_init */
    0,                              /* tp_alloc */
    unicode_new,                    /* tp_new */
    PyObject_Del,                   /* tp_free */
};
13290
13291/* Initialize the Unicode implementation */
13292
Thomas Wouters78890102000-07-22 19:25:51 +000013293void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013294{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013295 int i;
13296
Thomas Wouters477c8d52006-05-27 19:21:47 +000013297 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013298 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013299 0x000A, /* LINE FEED */
13300 0x000D, /* CARRIAGE RETURN */
13301 0x001C, /* FILE SEPARATOR */
13302 0x001D, /* GROUP SEPARATOR */
13303 0x001E, /* RECORD SEPARATOR */
13304 0x0085, /* NEXT LINE */
13305 0x2028, /* LINE SEPARATOR */
13306 0x2029, /* PARAGRAPH SEPARATOR */
13307 };
13308
Fred Drakee4315f52000-05-09 19:53:39 +000013309 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013310 unicode_empty = PyUnicode_New(0, 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013311 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013312 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013313 Py_FatalError("Can't create empty string");
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013314
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013315 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013316 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013317 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013318 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013319
13320 /* initialize the linebreak bloom filter */
13321 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013322 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013323 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013324
13325 PyType_Ready(&EncodingMapType);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013326}
13327
13328/* Finalize the Unicode implementation */
13329
/* Release cached unicode objects.

   This implementation releases nothing and unconditionally returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
13335
Guido van Rossumd57fd912000-03-10 22:53:23 +000013336void
Thomas Wouters78890102000-07-22 19:25:51 +000013337_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013338{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013339 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013340
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013341 Py_XDECREF(unicode_empty);
13342 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013343
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013344 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013345 if (unicode_latin1[i]) {
13346 Py_DECREF(unicode_latin1[i]);
13347 unicode_latin1[i] = NULL;
13348 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013349 }
Christian Heimesa156e092008-02-16 07:38:31 +000013350 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013351}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013352
Walter Dörwald16807132007-05-25 13:52:07 +000013353void
13354PyUnicode_InternInPlace(PyObject **p)
13355{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013356 register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
13357 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020013358#ifdef Py_DEBUG
13359 assert(s != NULL);
13360 assert(_PyUnicode_CHECK(s));
13361#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000013362 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020013363 return;
13364#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000013365 /* If it's a subclass, we don't really know what putting
13366 it in the interned dict might do. */
13367 if (!PyUnicode_CheckExact(s))
13368 return;
13369 if (PyUnicode_CHECK_INTERNED(s))
13370 return;
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013371 if (_PyUnicode_READY_REPLACE(p)) {
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013372 assert(0 && "_PyUnicode_READY_REPLACE fail in PyUnicode_InternInPlace");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013373 return;
13374 }
Victor Stinner1b4f9ce2011-10-03 13:28:14 +020013375 s = (PyUnicodeObject *)(*p);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013376 if (interned == NULL) {
13377 interned = PyDict_New();
13378 if (interned == NULL) {
13379 PyErr_Clear(); /* Don't leave an exception */
13380 return;
13381 }
13382 }
13383 /* It might be that the GetItem call fails even
13384 though the key is present in the dictionary,
13385 namely when this happens during a stack overflow. */
13386 Py_ALLOW_RECURSION
Benjamin Peterson29060642009-01-31 22:14:21 +000013387 t = PyDict_GetItem(interned, (PyObject *)s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013388 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000013389
Benjamin Peterson29060642009-01-31 22:14:21 +000013390 if (t) {
13391 Py_INCREF(t);
13392 Py_DECREF(*p);
13393 *p = t;
13394 return;
13395 }
Walter Dörwald16807132007-05-25 13:52:07 +000013396
Benjamin Peterson14339b62009-01-31 16:36:08 +000013397 PyThreadState_GET()->recursion_critical = 1;
13398 if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
13399 PyErr_Clear();
13400 PyThreadState_GET()->recursion_critical = 0;
13401 return;
13402 }
13403 PyThreadState_GET()->recursion_critical = 0;
13404 /* The two references in interned are not counted by refcnt.
13405 The deallocator will take care of this */
13406 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013407 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000013408}
13409
13410void
13411PyUnicode_InternImmortal(PyObject **p)
13412{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013413 PyUnicodeObject *u = (PyUnicodeObject *)*p;
13414
Benjamin Peterson14339b62009-01-31 16:36:08 +000013415 PyUnicode_InternInPlace(p);
13416 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013417 _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013418 Py_INCREF(*p);
13419 }
Walter Dörwald16807132007-05-25 13:52:07 +000013420}
13421
13422PyObject *
13423PyUnicode_InternFromString(const char *cp)
13424{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013425 PyObject *s = PyUnicode_FromString(cp);
13426 if (s == NULL)
13427 return NULL;
13428 PyUnicode_InternInPlace(&s);
13429 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000013430}
13431
Alexander Belopolsky40018472011-02-26 01:02:56 +000013432void
13433_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000013434{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013435 PyObject *keys;
13436 PyUnicodeObject *s;
13437 Py_ssize_t i, n;
13438 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000013439
Benjamin Peterson14339b62009-01-31 16:36:08 +000013440 if (interned == NULL || !PyDict_Check(interned))
13441 return;
13442 keys = PyDict_Keys(interned);
13443 if (keys == NULL || !PyList_Check(keys)) {
13444 PyErr_Clear();
13445 return;
13446 }
Walter Dörwald16807132007-05-25 13:52:07 +000013447
Benjamin Peterson14339b62009-01-31 16:36:08 +000013448 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13449 detector, interned unicode strings are not forcibly deallocated;
13450 rather, we give them their stolen references back, and then clear
13451 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000013452
Benjamin Peterson14339b62009-01-31 16:36:08 +000013453 n = PyList_GET_SIZE(keys);
13454 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000013455 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013456 for (i = 0; i < n; i++) {
13457 s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013458 if (PyUnicode_READY(s) == -1) {
13459 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013460 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013461 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013462 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013463 case SSTATE_NOT_INTERNED:
13464 /* XXX Shouldn't happen */
13465 break;
13466 case SSTATE_INTERNED_IMMORTAL:
13467 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013468 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013469 break;
13470 case SSTATE_INTERNED_MORTAL:
13471 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013472 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013473 break;
13474 default:
13475 Py_FatalError("Inconsistent interned string state.");
13476 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013477 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013478 }
13479 fprintf(stderr, "total size of all interned strings: "
13480 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13481 "mortal/immortal\n", mortal_size, immortal_size);
13482 Py_DECREF(keys);
13483 PyDict_Clear(interned);
13484 Py_DECREF(interned);
13485 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000013486}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013487
13488
13489/********************* Unicode Iterator **************************/
13490
13491typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013492 PyObject_HEAD
13493 Py_ssize_t it_index;
13494 PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013495} unicodeiterobject;
13496
13497static void
13498unicodeiter_dealloc(unicodeiterobject *it)
13499{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013500 _PyObject_GC_UNTRACK(it);
13501 Py_XDECREF(it->it_seq);
13502 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013503}
13504
13505static int
13506unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
13507{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013508 Py_VISIT(it->it_seq);
13509 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013510}
13511
13512static PyObject *
13513unicodeiter_next(unicodeiterobject *it)
13514{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013515 PyUnicodeObject *seq;
13516 PyObject *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013517
Benjamin Peterson14339b62009-01-31 16:36:08 +000013518 assert(it != NULL);
13519 seq = it->it_seq;
13520 if (seq == NULL)
13521 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013522 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013523
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013524 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
13525 int kind = PyUnicode_KIND(seq);
13526 void *data = PyUnicode_DATA(seq);
13527 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
13528 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013529 if (item != NULL)
13530 ++it->it_index;
13531 return item;
13532 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013533
Benjamin Peterson14339b62009-01-31 16:36:08 +000013534 Py_DECREF(seq);
13535 it->it_seq = NULL;
13536 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013537}
13538
13539static PyObject *
13540unicodeiter_len(unicodeiterobject *it)
13541{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013542 Py_ssize_t len = 0;
13543 if (it->it_seq)
13544 len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
13545 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013546}
13547
13548PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
13549
13550static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013551 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000013552 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000013553 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013554};
13555
13556PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013557 PyVarObject_HEAD_INIT(&PyType_Type, 0)
13558 "str_iterator", /* tp_name */
13559 sizeof(unicodeiterobject), /* tp_basicsize */
13560 0, /* tp_itemsize */
13561 /* methods */
13562 (destructor)unicodeiter_dealloc, /* tp_dealloc */
13563 0, /* tp_print */
13564 0, /* tp_getattr */
13565 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013566 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013567 0, /* tp_repr */
13568 0, /* tp_as_number */
13569 0, /* tp_as_sequence */
13570 0, /* tp_as_mapping */
13571 0, /* tp_hash */
13572 0, /* tp_call */
13573 0, /* tp_str */
13574 PyObject_GenericGetAttr, /* tp_getattro */
13575 0, /* tp_setattro */
13576 0, /* tp_as_buffer */
13577 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
13578 0, /* tp_doc */
13579 (traverseproc)unicodeiter_traverse, /* tp_traverse */
13580 0, /* tp_clear */
13581 0, /* tp_richcompare */
13582 0, /* tp_weaklistoffset */
13583 PyObject_SelfIter, /* tp_iter */
13584 (iternextfunc)unicodeiter_next, /* tp_iternext */
13585 unicodeiter_methods, /* tp_methods */
13586 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013587};
13588
13589static PyObject *
13590unicode_iter(PyObject *seq)
13591{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013592 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013593
Benjamin Peterson14339b62009-01-31 16:36:08 +000013594 if (!PyUnicode_Check(seq)) {
13595 PyErr_BadInternalCall();
13596 return NULL;
13597 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013598 if (PyUnicode_READY(seq) == -1)
13599 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013600 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
13601 if (it == NULL)
13602 return NULL;
13603 it->it_index = 0;
13604 Py_INCREF(seq);
13605 it->it_seq = (PyUnicodeObject *)seq;
13606 _PyObject_GC_TRACK(it);
13607 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013608}
13609
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013610#define UNIOP(x) Py_UNICODE_##x
13611#define UNIOP_t Py_UNICODE
13612#include "uniops.h"
13613#undef UNIOP
13614#undef UNIOP_t
13615#define UNIOP(x) Py_UCS4_##x
13616#define UNIOP_t Py_UCS4
13617#include "uniops.h"
13618#undef UNIOP
13619#undef UNIOP_t
Victor Stinner331ea922010-08-10 16:37:20 +000013620
Victor Stinner71133ff2010-09-01 23:43:53 +000013621Py_UNICODE*
Victor Stinner46408602010-09-03 16:18:00 +000013622PyUnicode_AsUnicodeCopy(PyObject *object)
Victor Stinner71133ff2010-09-01 23:43:53 +000013623{
13624 PyUnicodeObject *unicode = (PyUnicodeObject *)object;
13625 Py_UNICODE *copy;
13626 Py_ssize_t size;
13627
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013628 if (!PyUnicode_Check(unicode)) {
13629 PyErr_BadArgument();
13630 return NULL;
13631 }
Victor Stinner71133ff2010-09-01 23:43:53 +000013632 /* Ensure we won't overflow the size. */
13633 if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
13634 PyErr_NoMemory();
13635 return NULL;
13636 }
13637 size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
13638 size *= sizeof(Py_UNICODE);
13639 copy = PyMem_Malloc(size);
13640 if (copy == NULL) {
13641 PyErr_NoMemory();
13642 return NULL;
13643 }
13644 memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
13645 return copy;
13646}
Martin v. Löwis5b222132007-06-10 09:51:05 +000013647
Georg Brandl66c221e2010-10-14 07:04:07 +000013648/* A _string module, to export formatter_parser and formatter_field_name_split
13649 to the string.Formatter class implemented in Python. */
13650
13651static PyMethodDef _string_methods[] = {
13652 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
13653 METH_O, PyDoc_STR("split the argument as a field name")},
13654 {"formatter_parser", (PyCFunction) formatter_parser,
13655 METH_O, PyDoc_STR("parse the argument as a format string")},
13656 {NULL, NULL}
13657};
13658
13659static struct PyModuleDef _string_module = {
13660 PyModuleDef_HEAD_INIT,
13661 "_string",
13662 PyDoc_STR("string helper module"),
13663 0,
13664 _string_methods,
13665 NULL,
13666 NULL,
13667 NULL,
13668 NULL
13669};
13670
13671PyMODINIT_FUNC
13672PyInit__string(void)
13673{
13674 return PyModule_Create(&_string_module);
13675}
13676
13677
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013678#ifdef __cplusplus
13679}
13680#endif