blob: 591c81b8481bc8d68d5be92682199b6764cd6525 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000044
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000045#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000046#include <windows.h>
47#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000048
/* Byte order selection: BYTEORDER_IS_BIG_ENDIAN is defined when the build
   configuration defines WORDS_BIGENDIAN; otherwise little endian is assumed
   and BYTEORDER_IS_LITTLE_ENDIAN is defined instead. */

#if !defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
56
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000057/* --- Globals ------------------------------------------------------------
58
59 The globals are initialized by the _PyUnicode_Init() API and should
60 not be used before calling that API.
61
62*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000063
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000064
65#ifdef __cplusplus
66extern "C" {
67#endif
68
/* Largest code point of Unicode 6.0: U+10FFFF (decimal 1,114,111). */
#define MAX_UNICODE 0x10ffff

/* Type check used by the macros of this file.  Debug builds go through
   _PyUnicode_CheckConsistency() with check_content=0; other builds only
   call PyUnicode_Check(). */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020077
/* Accessors for the fields of the string structures.  The names starting
   with an underscore only cast and dereference (and can be assigned to,
   except _PyUnicode_KIND and _PyUnicode_GET_LENGTH); PyUnicode_UTF8() and
   PyUnicode_UTF8_LENGTH() additionally assert that op is a ready string and
   treat compact ASCII strings separately. */

/* utf8 field of the PyCompactUnicodeObject layout */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* Compact ASCII string: the memory located right after the PyASCIIObject
   structure; any other string: the utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     !PyUnicode_IS_COMPACT_ASCII(op) ?                  \
         _PyUnicode_UTF8(op) :                          \
         ((char*)((PyASCIIObject*)(op) + 1)))

/* utf8_length field of the PyCompactUnicodeObject layout */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* Compact ASCII string: the length field of the PyASCIIObject structure;
   any other string: the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     !PyUnicode_IS_COMPACT_ASCII(op) ?                  \
         _PyUnicode_UTF8_LENGTH(op) :                   \
         ((PyASCIIObject*)(op))->length)

/* wstr pointer (PyASCIIObject) and its length (PyCompactUnicodeObject) */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)

/* length, state and hash fields of the PyASCIIObject structure */
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)

/* Same fields as above, but read through _PyUnicode_CHECK() first */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* data.any pointer of the PyUnicodeObject layout */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200112
/* File-local replacement of PyUnicode_READY(): evaluates to 0 when op is
   already ready, and to the result of _PyUnicode_Ready(op) otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                     \
    (assert(_PyUnicode_CHECK(op)),              \
     (!PyUnicode_IS_READY(op) ?                 \
          _PyUnicode_Ready(op) :                \
          0))
Victor Stinner910337b2011-10-03 03:20:16 +0200119
/* True if the utf8 pointer of op is the character data itself (no separate
   UTF-8 buffer).  Must not be used on a compact ASCII string. */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of op is the character data itself (no separate
   wchar_t buffer).  The macro only refers to its own parameter: it used to
   expand to a variable named "unicode" taken from the calling scope. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
127
/* True if the Unicode object owns a UTF-8 memory block of its own, i.e. the
   object is not a compact ASCII string, its utf8 pointer is set, and that
   pointer is not the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op) &&                \
      _PyUnicode_UTF8(op) &&                            \
      _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object owns a wstr memory block of its own, i.e. its
   wstr pointer is set and either the string is not ready or that pointer is
   not the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op)                               \
      && (!PyUnicode_IS_READY(op)                       \
          || _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
143
/* Generic helper macro to convert characters of different types.

   from_type and to_type have to be valid type names.  begin and end are
   pointers to the source characters and should be of type "from_type *";
   the characters in [begin, end) are converted.  to points to the buffer
   where the result characters are written; it is converted to "to_type *".

   Every argument is evaluated exactly once.  "to" may be any pointer
   expression (for example "buffer + offset"): it is parenthesized before
   the cast so that the offset is applied to the original pointer type.
   The main loop is unrolled four characters at a time; the second loop
   handles the remaining 0-3 characters. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + (_n & ~ (Py_ssize_t) 3);            \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200167
/* To be used after the content of the Unicode string has been modified:
   resets the hash field to -1. */
#define _PyUnicode_DIRTY(op)                    \
    do {                                        \
        _PyUnicode_HASH(op) = -1;               \
    } while (0)
170
Walter Dörwald16807132007-05-25 13:52:07 +0000171/* This dictionary holds all interned unicode strings. Note that references
172 to strings in this dictionary are *not* counted in the string's ob_refcnt.
173 When the interned string reaches a refcnt of 0 the string deallocation
174 function will delete the reference from this dictionary.
175
176 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000177 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000178*/
179static PyObject *interned;
180
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000181/* The empty Unicode object is shared to improve performance. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200182static PyObject *unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000183
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200184/* List of static strings. */
185static _Py_Identifier *static_strings;
186
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000187/* Single character Unicode strings in the Latin-1 range are being
188 shared as well. */
Victor Stinnera464fc12011-10-02 20:39:30 +0200189static PyObject *unicode_latin1[256];
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000190
/* Fast detection of the most frequent whitespace characters: lookup table
   with one entry per ASCII code (128 entries), 1 meaning "whitespace".

   Entries set to 1:
     0x09 CHARACTER TABULATION   0x0A LINE FEED
     0x0B LINE TABULATION        0x0C FORM FEED
     0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR         0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR       0x1F UNIT SEPARATOR
     0x20 SPACE
*/
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
221
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200222/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200223static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200224static PyObject* get_latin1_char(unsigned char ch);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200225static void copy_characters(
226 PyObject *to, Py_ssize_t to_start,
227 PyObject *from, Py_ssize_t from_start,
228 Py_ssize_t how_many);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200229
Alexander Belopolsky40018472011-02-26 01:02:56 +0000230static PyObject *
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200231unicode_fromascii(const unsigned char *s, Py_ssize_t size);
232static PyObject *
233_PyUnicode_FromUCS1(const unsigned char *s, Py_ssize_t size);
234static PyObject *
235_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
236static PyObject *
237_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
238
239static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000240unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000241 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100242 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000243 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
244
Alexander Belopolsky40018472011-02-26 01:02:56 +0000245static void
246raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300247 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100248 PyObject *unicode,
249 Py_ssize_t startpos, Py_ssize_t endpos,
250 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000251
/* Same kind of lookup table for line breaks: one entry per ASCII code
   (128 entries), 1 meaning "line break".

   Entries set to 1:
     0x0A LINE FEED              0x0B LINE TABULATION
     0x0C FORM FEED              0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR         0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR
*/
static unsigned char ascii_linebreak[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
279
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300280/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
281 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000282Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000283PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000284{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000285#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000286 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000287#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000288 /* This is actually an illegal character, so it should
289 not be passed to unichr. */
290 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000291#endif
292}
293
#ifdef Py_DEBUG
/* Debug-build sanity check of a Unicode object.

   Asserts that the state flags, the kind, the data pointer and the utf8 and
   wstr fields of op agree with each other.  If check_content is non-zero
   and the kind is not PyUnicode_WCHAR_KIND, the characters are also scanned
   to assert that the smallest kind able to hold them is used.

   Always returns 1, so the call can itself be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii = (PyASCIIObject *)op;
    unsigned int kind;

    assert(PyUnicode_Check(op));
    kind = ascii->state.kind;

    if (ascii->state.ascii != 1 || ascii->state.compact != 1) {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;
        /* the kind for which wstr is expected to be the data itself */
#if SIZEOF_WCHAR_T == 2
        const unsigned int wstr_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int wstr_kind = PyUnicode_4BYTE_KIND;
#endif

        if (ascii->state.compact != 1) {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind != PyUnicode_WCHAR_KIND) {
                /* ready string with a separate data block */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (!ascii->state.ascii)
                    assert (compact->utf8 != data);
                else {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
            }
            else {
                /* string which is not ready: only wstr is set */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
        }
        else {
            /* compact, non-ASCII: the data follows the structure */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            if (kind != wstr_kind)
                assert(ascii->wstr != data);
            else {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
        }

        /* a missing buffer must come with a zero length */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    else {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        void *chars = PyUnicode_DATA(ascii);
        Py_UCS4 highest = 0;
        Py_ssize_t pos;

        for (pos = 0; pos < ascii->length; pos++) {
            Py_UCS4 ch = PyUnicode_READ(kind, chars, pos);
            if (highest < ch)
                highest = ch;
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (ascii->state.ascii != 0)
                assert(highest < 128);
            else {
                assert(highest >= 128);
                assert(highest <= 255);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            assert(highest >= 0x100);
            assert(highest <= 0xFFFF);
            break;
        default:
            assert(highest >= 0x10000);
            assert(highest <= MAX_UNICODE);
            break;
        }
    }
    return 1;
}
#endif
405
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100406static PyObject*
407unicode_result_wchar(PyObject *unicode)
408{
409#ifndef Py_DEBUG
410 Py_ssize_t len;
411
412 assert(Py_REFCNT(unicode) == 1);
413
414 len = _PyUnicode_WSTR_LENGTH(unicode);
415 if (len == 0) {
416 Py_INCREF(unicode_empty);
417 Py_DECREF(unicode);
418 return unicode_empty;
419 }
420
421 if (len == 1) {
422 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
423 if (ch < 256) {
424 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
425 Py_DECREF(unicode);
426 return latin1_char;
427 }
428 }
429
430 if (_PyUnicode_Ready(unicode) < 0) {
431 Py_XDECREF(unicode);
432 return NULL;
433 }
434#else
435 /* don't make the result ready in debug mode to ensure that the caller
436 makes the string ready before using it */
437 assert(_PyUnicode_CheckConsistency(unicode, 1));
438#endif
439 return unicode;
440}
441
442static PyObject*
443unicode_result_ready(PyObject *unicode)
444{
445 Py_ssize_t length;
446
447 length = PyUnicode_GET_LENGTH(unicode);
448 if (length == 0) {
449 if (unicode != unicode_empty) {
450 Py_INCREF(unicode_empty);
451 Py_DECREF(unicode);
452 }
453 return unicode_empty;
454 }
455
456 if (length == 1) {
457 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
458 if (ch < 256) {
459 PyObject *latin1_char = unicode_latin1[ch];
460 if (latin1_char != NULL) {
461 if (unicode != latin1_char) {
462 Py_INCREF(latin1_char);
463 Py_DECREF(unicode);
464 }
465 return latin1_char;
466 }
467 else {
468 assert(_PyUnicode_CheckConsistency(unicode, 1));
469 Py_INCREF(unicode);
470 unicode_latin1[ch] = unicode;
471 return unicode;
472 }
473 }
474 }
475
476 assert(_PyUnicode_CheckConsistency(unicode, 1));
477 return unicode;
478}
479
480static PyObject*
481unicode_result(PyObject *unicode)
482{
483 assert(_PyUnicode_CHECK(unicode));
484 if (PyUnicode_IS_READY(unicode))
485 return unicode_result_ready(unicode);
486 else
487 return unicode_result_wchar(unicode);
488}
489
#if defined(HAVE_MBCS)
/* Only declared when HAVE_MBCS is defined. */
static OSVERSIONINFOEX winver;
#endif
493
Thomas Wouters477c8d52006-05-27 19:21:47 +0000494/* --- Bloom Filters ----------------------------------------------------- */
495
496/* stuff to implement simple "bloom filters" for Unicode characters.
497 to keep things simple, we use a single bitmask, using the least 5
498 bits from each unicode characters as the bit index. */
499
500/* the linebreak mask is set up by Unicode_Init below */
501
Antoine Pitrouf068f942010-01-13 14:19:12 +0000502#if LONG_BIT >= 128
503#define BLOOM_WIDTH 128
504#elif LONG_BIT >= 64
505#define BLOOM_WIDTH 64
506#elif LONG_BIT >= 32
507#define BLOOM_WIDTH 32
508#else
509#error "LONG_BIT is smaller than 32"
510#endif
511
/* Type of a bloom bitmask. */
#define BLOOM_MASK unsigned long

/* Bitmask used by BLOOM_LINEBREAK() for the non-ASCII characters; it is
   not initialized here. */
static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD(mask, ch): set the bit of character ch in mask (mask must be
   a modifiable lvalue).
   BLOOM(mask, ch): non-zero if the bit of ch is set in mask.  A zero result
   means that ch was never added; a non-zero result may be a false positive
   since different characters share the same bit.
   The mask argument is parenthesized so that an expression such as
   "a | b" can be passed safely. */
#define BLOOM_ADD(mask, ch) (((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
#define BLOOM(mask, ch)     (((mask) &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Non-zero if ch is a line break: table lookup for the ASCII range (below
   128), bloom mask pre-check followed by Py_UNICODE_ISLINEBREAK() for any
   other character.  ch is evaluated more than once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000522
Alexander Belopolsky40018472011-02-26 01:02:56 +0000523Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200524make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000525{
526 /* calculate simple bloom-style bitmask for a given unicode string */
527
Antoine Pitrouf068f942010-01-13 14:19:12 +0000528 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000529 Py_ssize_t i;
530
531 mask = 0;
532 for (i = 0; i < len; i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200533 BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
Thomas Wouters477c8d52006-05-27 19:21:47 +0000534
535 return mask;
536}
537
/* True if chr occurs in the string str: the bloom mask is tested first and
   PyUnicode_FindChar() is only called over the whole string when the bit of
   chr is set in mask.  chr and str are evaluated more than once. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr) &&                                                \
     (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000541
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200542/* Compilation of templated routines */
543
544#include "stringlib/asciilib.h"
545#include "stringlib/fastsearch.h"
546#include "stringlib/partition.h"
547#include "stringlib/split.h"
548#include "stringlib/count.h"
549#include "stringlib/find.h"
550#include "stringlib/find_max_char.h"
551#include "stringlib/localeutil.h"
552#include "stringlib/undef.h"
553
554#include "stringlib/ucs1lib.h"
555#include "stringlib/fastsearch.h"
556#include "stringlib/partition.h"
557#include "stringlib/split.h"
558#include "stringlib/count.h"
559#include "stringlib/find.h"
560#include "stringlib/find_max_char.h"
561#include "stringlib/localeutil.h"
562#include "stringlib/undef.h"
563
564#include "stringlib/ucs2lib.h"
565#include "stringlib/fastsearch.h"
566#include "stringlib/partition.h"
567#include "stringlib/split.h"
568#include "stringlib/count.h"
569#include "stringlib/find.h"
570#include "stringlib/find_max_char.h"
571#include "stringlib/localeutil.h"
572#include "stringlib/undef.h"
573
574#include "stringlib/ucs4lib.h"
575#include "stringlib/fastsearch.h"
576#include "stringlib/partition.h"
577#include "stringlib/split.h"
578#include "stringlib/count.h"
579#include "stringlib/find.h"
580#include "stringlib/find_max_char.h"
581#include "stringlib/localeutil.h"
582#include "stringlib/undef.h"
583
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200584#include "stringlib/unicodedefs.h"
585#include "stringlib/fastsearch.h"
586#include "stringlib/count.h"
587#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100588#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200589
Guido van Rossumd57fd912000-03-10 22:53:23 +0000590/* --- Unicode Object ----------------------------------------------------- */
591
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200592static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200593fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200594
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200595Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
596 Py_ssize_t size, Py_UCS4 ch,
597 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200598{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200599 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
600
601 switch (kind) {
602 case PyUnicode_1BYTE_KIND:
603 {
604 Py_UCS1 ch1 = (Py_UCS1) ch;
605 if (ch1 == ch)
606 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
607 else
608 return -1;
609 }
610 case PyUnicode_2BYTE_KIND:
611 {
612 Py_UCS2 ch2 = (Py_UCS2) ch;
613 if (ch2 == ch)
614 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
615 else
616 return -1;
617 }
618 case PyUnicode_4BYTE_KIND:
619 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
620 default:
621 assert(0);
622 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200623 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200624}
625
Victor Stinnerfe226c02011-10-03 03:52:20 +0200626static PyObject*
627resize_compact(PyObject *unicode, Py_ssize_t length)
628{
629 Py_ssize_t char_size;
630 Py_ssize_t struct_size;
631 Py_ssize_t new_size;
632 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100633 PyObject *new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200634
635 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200636 char_size = PyUnicode_KIND(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200637 if (PyUnicode_IS_COMPACT_ASCII(unicode))
638 struct_size = sizeof(PyASCIIObject);
639 else
640 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200641 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200642
Victor Stinnerfe226c02011-10-03 03:52:20 +0200643 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
Victor Stinner84def372011-12-11 20:04:56 +0100644 Py_DECREF(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200645 PyErr_NoMemory();
646 return NULL;
647 }
648 new_size = (struct_size + (length + 1) * char_size);
649
Victor Stinner84def372011-12-11 20:04:56 +0100650 _Py_DEC_REFTOTAL;
651 _Py_ForgetReference(unicode);
652
653 new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
654 if (new_unicode == NULL) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200655 PyObject_Del(unicode);
656 PyErr_NoMemory();
657 return NULL;
658 }
Victor Stinner84def372011-12-11 20:04:56 +0100659 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200660 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100661
Victor Stinnerfe226c02011-10-03 03:52:20 +0200662 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200663 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200664 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200665 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
666 _PyUnicode_WSTR_LENGTH(unicode) = length;
667 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200668 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
669 length, 0);
670 return unicode;
671}
672
Alexander Belopolsky40018472011-02-26 01:02:56 +0000673static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200674resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000675{
Victor Stinner95663112011-10-04 01:03:50 +0200676 wchar_t *wstr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200677 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200678 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000679
Victor Stinner95663112011-10-04 01:03:50 +0200680 _PyUnicode_DIRTY(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200681
682 if (PyUnicode_IS_READY(unicode)) {
683 Py_ssize_t char_size;
684 Py_ssize_t new_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200685 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200686 void *data;
687
688 data = _PyUnicode_DATA_ANY(unicode);
689 assert(data != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200690 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200691 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
692 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinner95663112011-10-04 01:03:50 +0200693 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
694 {
695 PyObject_DEL(_PyUnicode_UTF8(unicode));
696 _PyUnicode_UTF8(unicode) = NULL;
697 _PyUnicode_UTF8_LENGTH(unicode) = 0;
698 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200699
700 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
701 PyErr_NoMemory();
702 return -1;
703 }
704 new_size = (length + 1) * char_size;
705
706 data = (PyObject *)PyObject_REALLOC(data, new_size);
707 if (data == NULL) {
708 PyErr_NoMemory();
709 return -1;
710 }
711 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200712 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200713 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200714 _PyUnicode_WSTR_LENGTH(unicode) = length;
715 }
716 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200717 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200718 _PyUnicode_UTF8_LENGTH(unicode) = length;
719 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200720 _PyUnicode_LENGTH(unicode) = length;
721 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinner95663112011-10-04 01:03:50 +0200722 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200723 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200724 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200725 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200726 }
Victor Stinner95663112011-10-04 01:03:50 +0200727 assert(_PyUnicode_WSTR(unicode) != NULL);
728
729 /* check for integer overflow */
730 if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
731 PyErr_NoMemory();
732 return -1;
733 }
734 wstr = _PyUnicode_WSTR(unicode);
735 wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
736 if (!wstr) {
737 PyErr_NoMemory();
738 return -1;
739 }
740 _PyUnicode_WSTR(unicode) = wstr;
741 _PyUnicode_WSTR(unicode)[length] = 0;
742 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200743 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000744 return 0;
745}
746
Victor Stinnerfe226c02011-10-03 03:52:20 +0200747static PyObject*
748resize_copy(PyObject *unicode, Py_ssize_t length)
749{
750 Py_ssize_t copy_length;
751 if (PyUnicode_IS_COMPACT(unicode)) {
752 PyObject *copy;
753 assert(PyUnicode_IS_READY(unicode));
754
755 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
756 if (copy == NULL)
757 return NULL;
758
759 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200760 copy_characters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200761 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200762 }
763 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200764 PyObject *w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200765 assert(_PyUnicode_WSTR(unicode) != NULL);
766 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200767 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200768 if (w == NULL)
769 return NULL;
770 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
771 copy_length = Py_MIN(copy_length, length);
772 Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
773 copy_length);
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200774 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200775 }
776}
777
Guido van Rossumd57fd912000-03-10 22:53:23 +0000778/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000779 Ux0000 terminated; some code (e.g. new_identifier)
780 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000781
782 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000783 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000784
785*/
786
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200787#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200788static int unicode_old_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200789#endif
790
Alexander Belopolsky40018472011-02-26 01:02:56 +0000791static PyUnicodeObject *
792_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000793{
794 register PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200795 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000796
Thomas Wouters477c8d52006-05-27 19:21:47 +0000797 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000798 if (length == 0 && unicode_empty != NULL) {
799 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200800 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000801 }
802
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000803 /* Ensure we won't overflow the size. */
804 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
805 return (PyUnicodeObject *)PyErr_NoMemory();
806 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200807 if (length < 0) {
808 PyErr_SetString(PyExc_SystemError,
809 "Negative size passed to _PyUnicode_New");
810 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000811 }
812
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200813#ifdef Py_DEBUG
814 ++unicode_old_new_calls;
815#endif
816
817 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
818 if (unicode == NULL)
819 return NULL;
820 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
821 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
822 if (!_PyUnicode_WSTR(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +0000823 PyErr_NoMemory();
824 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000825 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200826
Jeremy Hyltond8082792003-09-16 19:41:39 +0000827 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000828 * the caller fails before initializing str -- unicode_resize()
829 * reads str[0], and the Keep-Alive optimization can keep memory
830 * allocated for str alive across a call to unicode_dealloc(unicode).
831 * We don't want unicode_resize to read uninitialized memory in
832 * that case.
833 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200834 _PyUnicode_WSTR(unicode)[0] = 0;
835 _PyUnicode_WSTR(unicode)[length] = 0;
836 _PyUnicode_WSTR_LENGTH(unicode) = length;
837 _PyUnicode_HASH(unicode) = -1;
838 _PyUnicode_STATE(unicode).interned = 0;
839 _PyUnicode_STATE(unicode).kind = 0;
840 _PyUnicode_STATE(unicode).compact = 0;
841 _PyUnicode_STATE(unicode).ready = 0;
842 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +0200843 _PyUnicode_DATA_ANY(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200844 _PyUnicode_LENGTH(unicode) = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200845 _PyUnicode_UTF8(unicode) = NULL;
846 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +0100847 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000848 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000849
Benjamin Peterson29060642009-01-31 22:14:21 +0000850 onError:
Amaury Forgeot d'Arc7888d082008-08-01 01:06:32 +0000851 /* XXX UNREF/NEWREF interface should be more symmetrical */
852 _Py_DEC_REFTOTAL;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000853 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000854 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000855 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000856}
857
Victor Stinnerf42dc442011-10-02 23:33:16 +0200858static const char*
859unicode_kind_name(PyObject *unicode)
860{
Victor Stinner42dfd712011-10-03 14:41:45 +0200861 /* don't check consistency: unicode_kind_name() is called from
862 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200863 if (!PyUnicode_IS_COMPACT(unicode))
864 {
865 if (!PyUnicode_IS_READY(unicode))
866 return "wstr";
867 switch(PyUnicode_KIND(unicode))
868 {
869 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200870 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200871 return "legacy ascii";
872 else
873 return "legacy latin1";
874 case PyUnicode_2BYTE_KIND:
875 return "legacy UCS2";
876 case PyUnicode_4BYTE_KIND:
877 return "legacy UCS4";
878 default:
879 return "<legacy invalid kind>";
880 }
881 }
882 assert(PyUnicode_IS_READY(unicode));
883 switch(PyUnicode_KIND(unicode))
884 {
885 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200886 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200887 return "ascii";
888 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200889 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200890 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200891 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200892 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200893 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200894 default:
895 return "<invalid compact kind>";
896 }
897}
898
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200899#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +0200900static int unicode_new_new_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200901
902/* Functions wrapping macros for use in debugger */
903char *_PyUnicode_utf8(void *unicode){
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200904 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200905}
906
907void *_PyUnicode_compact_data(void *unicode) {
908 return _PyUnicode_COMPACT_DATA(unicode);
909}
910void *_PyUnicode_data(void *unicode){
911 printf("obj %p\n", unicode);
912 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
913 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
914 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
915 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
916 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
917 return PyUnicode_DATA(unicode);
918}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200919
920void
921_PyUnicode_Dump(PyObject *op)
922{
923 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200924 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
925 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
926 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +0200927
Victor Stinnera849a4b2011-10-03 12:12:11 +0200928 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +0200929 {
930 if (ascii->state.ascii)
931 data = (ascii + 1);
932 else
933 data = (compact + 1);
934 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200935 else
936 data = unicode->data.any;
Victor Stinner0d60e872011-10-23 19:47:19 +0200937 printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
938
Victor Stinnera849a4b2011-10-03 12:12:11 +0200939 if (ascii->wstr == data)
940 printf("shared ");
941 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +0200942
Victor Stinnera3b334d2011-10-03 13:53:37 +0200943 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnera849a4b2011-10-03 12:12:11 +0200944 printf(" (%zu), ", compact->wstr_length);
945 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
946 printf("shared ");
947 printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200948 }
Victor Stinnera849a4b2011-10-03 12:12:11 +0200949 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200950}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200951#endif
952
953PyObject *
954PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
955{
956 PyObject *obj;
957 PyCompactUnicodeObject *unicode;
958 void *data;
959 int kind_state;
Victor Stinner9e9d6892011-10-04 01:02:02 +0200960 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200961 Py_ssize_t char_size;
962 Py_ssize_t struct_size;
963
964 /* Optimization for empty strings */
965 if (size == 0 && unicode_empty != NULL) {
966 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200967 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200968 }
969
970#ifdef Py_DEBUG
971 ++unicode_new_new_calls;
972#endif
973
Victor Stinner9e9d6892011-10-04 01:02:02 +0200974 is_ascii = 0;
975 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200976 struct_size = sizeof(PyCompactUnicodeObject);
977 if (maxchar < 128) {
978 kind_state = PyUnicode_1BYTE_KIND;
979 char_size = 1;
980 is_ascii = 1;
981 struct_size = sizeof(PyASCIIObject);
982 }
983 else if (maxchar < 256) {
984 kind_state = PyUnicode_1BYTE_KIND;
985 char_size = 1;
986 }
987 else if (maxchar < 65536) {
988 kind_state = PyUnicode_2BYTE_KIND;
989 char_size = 2;
990 if (sizeof(wchar_t) == 2)
991 is_sharing = 1;
992 }
993 else {
994 kind_state = PyUnicode_4BYTE_KIND;
995 char_size = 4;
996 if (sizeof(wchar_t) == 4)
997 is_sharing = 1;
998 }
999
1000 /* Ensure we won't overflow the size. */
1001 if (size < 0) {
1002 PyErr_SetString(PyExc_SystemError,
1003 "Negative size passed to PyUnicode_New");
1004 return NULL;
1005 }
1006 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1007 return PyErr_NoMemory();
1008
1009 /* Duplicated allocation code from _PyObject_New() instead of a call to
1010 * PyObject_New() so we are able to allocate space for the object and
1011 * it's data buffer.
1012 */
1013 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1014 if (obj == NULL)
1015 return PyErr_NoMemory();
1016 obj = PyObject_INIT(obj, &PyUnicode_Type);
1017 if (obj == NULL)
1018 return NULL;
1019
1020 unicode = (PyCompactUnicodeObject *)obj;
1021 if (is_ascii)
1022 data = ((PyASCIIObject*)obj) + 1;
1023 else
1024 data = unicode + 1;
1025 _PyUnicode_LENGTH(unicode) = size;
1026 _PyUnicode_HASH(unicode) = -1;
1027 _PyUnicode_STATE(unicode).interned = 0;
1028 _PyUnicode_STATE(unicode).kind = kind_state;
1029 _PyUnicode_STATE(unicode).compact = 1;
1030 _PyUnicode_STATE(unicode).ready = 1;
1031 _PyUnicode_STATE(unicode).ascii = is_ascii;
1032 if (is_ascii) {
1033 ((char*)data)[size] = 0;
1034 _PyUnicode_WSTR(unicode) = NULL;
1035 }
1036 else if (kind_state == PyUnicode_1BYTE_KIND) {
1037 ((char*)data)[size] = 0;
1038 _PyUnicode_WSTR(unicode) = NULL;
1039 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001040 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001041 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001042 }
1043 else {
1044 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001045 unicode->utf8_length = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001046 if (kind_state == PyUnicode_2BYTE_KIND)
1047 ((Py_UCS2*)data)[size] = 0;
1048 else /* kind_state == PyUnicode_4BYTE_KIND */
1049 ((Py_UCS4*)data)[size] = 0;
1050 if (is_sharing) {
1051 _PyUnicode_WSTR_LENGTH(unicode) = size;
1052 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1053 }
1054 else {
1055 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1056 _PyUnicode_WSTR(unicode) = NULL;
1057 }
1058 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01001059 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001060 return obj;
1061}
1062
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bit wchar_t representation to UCS4.
   A high surrogate immediately followed by a low surrogate is joined into
   one code point; every other unit is copied unchanged.  The other
   conversions are implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src+1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
            continue;
        }
        *dst++ = *src++;
    }
    /* the output must have been filled exactly */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1102
Victor Stinnercd9950f2011-10-02 00:34:53 +02001103static int
1104_PyUnicode_Dirty(PyObject *unicode)
1105{
Victor Stinner910337b2011-10-03 03:20:16 +02001106 assert(_PyUnicode_CHECK(unicode));
Victor Stinnercd9950f2011-10-02 00:34:53 +02001107 if (Py_REFCNT(unicode) != 1) {
Victor Stinner01698042011-10-04 00:04:26 +02001108 PyErr_SetString(PyExc_SystemError,
Victor Stinnercd9950f2011-10-02 00:34:53 +02001109 "Cannot modify a string having more than 1 reference");
1110 return -1;
1111 }
1112 _PyUnicode_DIRTY(unicode);
1113 return 0;
1114}
1115
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001116static int
1117_copy_characters(PyObject *to, Py_ssize_t to_start,
1118 PyObject *from, Py_ssize_t from_start,
1119 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001120{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001121 unsigned int from_kind, to_kind;
1122 void *from_data, *to_data;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001123 int fast;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001124
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001125 assert(PyUnicode_Check(from));
1126 assert(PyUnicode_Check(to));
1127 assert(PyUnicode_IS_READY(from));
1128 assert(PyUnicode_IS_READY(to));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001129
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001130 assert(PyUnicode_GET_LENGTH(from) >= how_many);
1131 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1132 assert(0 <= how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001133
Victor Stinnerf5ca1a22011-09-28 23:54:59 +02001134 if (how_many == 0)
1135 return 0;
1136
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001137 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001138 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001139 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001140 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001141
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001142#ifdef Py_DEBUG
1143 if (!check_maxchar
1144 && (from_kind > to_kind
1145 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001146 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001147 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1148 Py_UCS4 ch;
1149 Py_ssize_t i;
1150 for (i=0; i < how_many; i++) {
1151 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1152 assert(ch <= to_maxchar);
1153 }
1154 }
1155#endif
1156 fast = (from_kind == to_kind);
1157 if (check_maxchar
1158 && (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1159 {
1160 /* deny latin1 => ascii */
1161 fast = 0;
1162 }
1163
1164 if (fast) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001165 Py_MEMCPY((char*)to_data + to_kind * to_start,
1166 (char*)from_data + from_kind * from_start,
1167 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001168 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001169 else if (from_kind == PyUnicode_1BYTE_KIND
1170 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001171 {
1172 _PyUnicode_CONVERT_BYTES(
1173 Py_UCS1, Py_UCS2,
1174 PyUnicode_1BYTE_DATA(from) + from_start,
1175 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1176 PyUnicode_2BYTE_DATA(to) + to_start
1177 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001178 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001179 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001180 && to_kind == PyUnicode_4BYTE_KIND)
1181 {
1182 _PyUnicode_CONVERT_BYTES(
1183 Py_UCS1, Py_UCS4,
1184 PyUnicode_1BYTE_DATA(from) + from_start,
1185 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1186 PyUnicode_4BYTE_DATA(to) + to_start
1187 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001188 }
1189 else if (from_kind == PyUnicode_2BYTE_KIND
1190 && to_kind == PyUnicode_4BYTE_KIND)
1191 {
1192 _PyUnicode_CONVERT_BYTES(
1193 Py_UCS2, Py_UCS4,
1194 PyUnicode_2BYTE_DATA(from) + from_start,
1195 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1196 PyUnicode_4BYTE_DATA(to) + to_start
1197 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001198 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001199 else {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001200 /* check if max_char(from substring) <= max_char(to) */
1201 if (from_kind > to_kind
1202 /* latin1 => ascii */
Victor Stinnerb9275c12011-10-05 14:01:42 +02001203 || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001204 {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001205 /* slow path to check for character overflow */
1206 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001207 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001208 Py_ssize_t i;
1209
Victor Stinner56c161a2011-10-06 02:47:11 +02001210#ifdef Py_DEBUG
Victor Stinnera0702ab2011-09-29 14:14:38 +02001211 for (i=0; i < how_many; i++) {
1212 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinner56c161a2011-10-06 02:47:11 +02001213 assert(ch <= to_maxchar);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001214 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1215 }
Victor Stinner56c161a2011-10-06 02:47:11 +02001216#else
1217 if (!check_maxchar) {
1218 for (i=0; i < how_many; i++) {
1219 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1220 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1221 }
1222 }
1223 else {
1224 for (i=0; i < how_many; i++) {
1225 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1226 if (ch > to_maxchar)
1227 return 1;
1228 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1229 }
1230 }
1231#endif
Victor Stinnera0702ab2011-09-29 14:14:38 +02001232 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001233 else {
Victor Stinner56c161a2011-10-06 02:47:11 +02001234 assert(0 && "inconsistent state");
1235 return 1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001236 }
1237 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001238 return 0;
1239}
1240
1241static void
1242copy_characters(PyObject *to, Py_ssize_t to_start,
1243 PyObject *from, Py_ssize_t from_start,
1244 Py_ssize_t how_many)
1245{
1246 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1247}
1248
1249Py_ssize_t
1250PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1251 PyObject *from, Py_ssize_t from_start,
1252 Py_ssize_t how_many)
1253{
1254 int err;
1255
1256 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1257 PyErr_BadInternalCall();
1258 return -1;
1259 }
1260
1261 if (PyUnicode_READY(from))
1262 return -1;
1263 if (PyUnicode_READY(to))
1264 return -1;
1265
1266 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1267 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1268 PyErr_Format(PyExc_SystemError,
1269 "Cannot write %zi characters at %zi "
1270 "in a string of %zi characters",
1271 how_many, to_start, PyUnicode_GET_LENGTH(to));
1272 return -1;
1273 }
1274
1275 if (how_many == 0)
1276 return 0;
1277
1278 if (_PyUnicode_Dirty(to))
1279 return -1;
1280
1281 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1282 if (err) {
1283 PyErr_Format(PyExc_SystemError,
1284 "Cannot copy %s characters "
1285 "into a string of %s characters",
1286 unicode_kind_name(from),
1287 unicode_kind_name(to));
1288 return -1;
1289 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001290 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001291}
1292
Victor Stinner17222162011-09-28 22:15:37 +02001293/* Find the maximum code point and count the number of surrogate pairs so a
1294 correct string length can be computed before converting a string to UCS4.
1295 This function counts single surrogates as a character and not as a pair.
1296
1297 Return 0 on success, or -1 on error. */
1298static int
1299find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1300 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001301{
1302 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001303 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001304
Victor Stinnerc53be962011-10-02 21:33:54 +02001305 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001306 *num_surrogates = 0;
1307 *maxchar = 0;
1308
1309 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001310#if SIZEOF_WCHAR_T == 2
Victor Stinnerca4f2072011-11-22 03:38:40 +01001311 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1312 && (iter+1) < end
1313 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001314 {
Victor Stinner8faf8212011-12-08 22:14:11 +01001315 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001316 ++(*num_surrogates);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001317 iter += 2;
1318 }
1319 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001321 {
1322 ch = *iter;
1323 iter++;
1324 }
1325 if (ch > *maxchar) {
1326 *maxchar = ch;
1327 if (*maxchar > MAX_UNICODE) {
1328 PyErr_Format(PyExc_ValueError,
1329 "character U+%x is not in range [U+0000; U+10ffff]",
1330 ch);
1331 return -1;
1332 }
1333 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001334 }
1335 return 0;
1336}
1337
1338#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001339static int unicode_ready_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001340#endif
1341
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001342int
1343_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001344{
1345 wchar_t *end;
1346 Py_UCS4 maxchar = 0;
1347 Py_ssize_t num_surrogates;
1348#if SIZEOF_WCHAR_T == 2
1349 Py_ssize_t length_wo_surrogates;
1350#endif
1351
Georg Brandl7597add2011-10-05 16:36:47 +02001352 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001353 strings were created using _PyObject_New() and where no canonical
1354 representation (the str field) has been set yet aka strings
1355 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001356 assert(_PyUnicode_CHECK(unicode));
1357 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001358 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001359 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001360 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001361 /* Actually, it should neither be interned nor be anything else: */
1362 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001363
1364#ifdef Py_DEBUG
1365 ++unicode_ready_calls;
1366#endif
1367
1368 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001369 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001370 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001371 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001372
1373 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001374 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1375 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001376 PyErr_NoMemory();
1377 return -1;
1378 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001379 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001380 _PyUnicode_WSTR(unicode), end,
1381 PyUnicode_1BYTE_DATA(unicode));
1382 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1383 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1384 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1385 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001386 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001387 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001388 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001389 }
1390 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001391 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001392 _PyUnicode_UTF8(unicode) = NULL;
1393 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001394 }
1395 PyObject_FREE(_PyUnicode_WSTR(unicode));
1396 _PyUnicode_WSTR(unicode) = NULL;
1397 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1398 }
1399 /* In this case we might have to convert down from 4-byte native
1400 wchar_t to 2-byte unicode. */
1401 else if (maxchar < 65536) {
1402 assert(num_surrogates == 0 &&
1403 "FindMaxCharAndNumSurrogatePairs() messed up");
1404
Victor Stinner506f5922011-09-28 22:34:18 +02001405#if SIZEOF_WCHAR_T == 2
1406 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001407 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001408 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1409 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1410 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001411 _PyUnicode_UTF8(unicode) = NULL;
1412 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001413#else
1414 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001415 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001416 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001417 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001418 PyErr_NoMemory();
1419 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001420 }
Victor Stinner506f5922011-09-28 22:34:18 +02001421 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1422 _PyUnicode_WSTR(unicode), end,
1423 PyUnicode_2BYTE_DATA(unicode));
1424 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1425 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1426 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001427 _PyUnicode_UTF8(unicode) = NULL;
1428 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001429 PyObject_FREE(_PyUnicode_WSTR(unicode));
1430 _PyUnicode_WSTR(unicode) = NULL;
1431 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1432#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001433 }
1434 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1435 else {
1436#if SIZEOF_WCHAR_T == 2
1437 /* in case the native representation is 2-bytes, we need to allocate a
1438 new normalized 4-byte version. */
1439 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001440 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1441 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001442 PyErr_NoMemory();
1443 return -1;
1444 }
1445 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1446 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001447 _PyUnicode_UTF8(unicode) = NULL;
1448 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001449 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1450 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001451 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001452 PyObject_FREE(_PyUnicode_WSTR(unicode));
1453 _PyUnicode_WSTR(unicode) = NULL;
1454 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1455#else
1456 assert(num_surrogates == 0);
1457
Victor Stinnerc3c74152011-10-02 20:39:55 +02001458 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001459 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001460 _PyUnicode_UTF8(unicode) = NULL;
1461 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001462 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1463#endif
1464 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1465 }
1466 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001467 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001468 return 0;
1469}
1470
Alexander Belopolsky40018472011-02-26 01:02:56 +00001471static void
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001472unicode_dealloc(register PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001473{
Walter Dörwald16807132007-05-25 13:52:07 +00001474 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001475 case SSTATE_NOT_INTERNED:
1476 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001477
Benjamin Peterson29060642009-01-31 22:14:21 +00001478 case SSTATE_INTERNED_MORTAL:
1479 /* revive dead object temporarily for DelItem */
1480 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001481 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001482 Py_FatalError(
1483 "deletion of interned string failed");
1484 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001485
Benjamin Peterson29060642009-01-31 22:14:21 +00001486 case SSTATE_INTERNED_IMMORTAL:
1487 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001488
Benjamin Peterson29060642009-01-31 22:14:21 +00001489 default:
1490 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001491 }
1492
Victor Stinner03490912011-10-03 23:45:12 +02001493 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001494 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001495 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001496 PyObject_DEL(_PyUnicode_UTF8(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001497
1498 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinner7931d9a2011-11-04 00:22:48 +01001499 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001500 }
1501 else {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001502 if (_PyUnicode_DATA_ANY(unicode))
1503 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Victor Stinner7931d9a2011-11-04 00:22:48 +01001504 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001505 }
1506}
1507
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001508#ifdef Py_DEBUG
1509static int
1510unicode_is_singleton(PyObject *unicode)
1511{
1512 PyASCIIObject *ascii = (PyASCIIObject *)unicode;
1513 if (unicode == unicode_empty)
1514 return 1;
1515 if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
1516 {
1517 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1518 if (ch < 256 && unicode_latin1[ch] == unicode)
1519 return 1;
1520 }
1521 return 0;
1522}
1523#endif
1524
Alexander Belopolsky40018472011-02-26 01:02:56 +00001525static int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001526unicode_resizable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001527{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001528 if (Py_REFCNT(unicode) != 1)
1529 return 0;
1530 if (PyUnicode_CHECK_INTERNED(unicode))
1531 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001532#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001533 /* singleton refcount is greater than 1 */
1534 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001535#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001536 return 1;
1537}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001538
Victor Stinnerfe226c02011-10-03 03:52:20 +02001539static int
1540unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1541{
1542 PyObject *unicode;
1543 Py_ssize_t old_length;
1544
1545 assert(p_unicode != NULL);
1546 unicode = *p_unicode;
1547
1548 assert(unicode != NULL);
1549 assert(PyUnicode_Check(unicode));
1550 assert(0 <= length);
1551
Victor Stinner910337b2011-10-03 03:20:16 +02001552 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001553 old_length = PyUnicode_WSTR_LENGTH(unicode);
1554 else
1555 old_length = PyUnicode_GET_LENGTH(unicode);
1556 if (old_length == length)
1557 return 0;
1558
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001559 if (length == 0) {
1560 Py_DECREF(*p_unicode);
1561 *p_unicode = unicode_empty;
1562 Py_INCREF(*p_unicode);
1563 return 0;
1564 }
1565
Victor Stinnerfe226c02011-10-03 03:52:20 +02001566 if (!unicode_resizable(unicode)) {
1567 PyObject *copy = resize_copy(unicode, length);
1568 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001569 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001570 Py_DECREF(*p_unicode);
1571 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001572 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001573 }
1574
Victor Stinnerfe226c02011-10-03 03:52:20 +02001575 if (PyUnicode_IS_COMPACT(unicode)) {
1576 *p_unicode = resize_compact(unicode, length);
1577 if (*p_unicode == NULL)
1578 return -1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001579 assert(_PyUnicode_CheckConsistency(*p_unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001580 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001581 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001582 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001583}
1584
Alexander Belopolsky40018472011-02-26 01:02:56 +00001585int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001586PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001587{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001588 PyObject *unicode;
1589 if (p_unicode == NULL) {
1590 PyErr_BadInternalCall();
1591 return -1;
1592 }
1593 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001594 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001595 {
1596 PyErr_BadInternalCall();
1597 return -1;
1598 }
1599 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001600}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001601
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001602static int
Victor Stinner0a045ef2011-11-09 00:02:42 +01001603unicode_widen(PyObject **p_unicode, unsigned int maxchar)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001604{
1605 PyObject *result;
1606 assert(PyUnicode_IS_READY(*p_unicode));
1607 if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
1608 return 0;
1609 result = PyUnicode_New(PyUnicode_GET_LENGTH(*p_unicode),
1610 maxchar);
1611 if (result == NULL)
1612 return -1;
1613 PyUnicode_CopyCharacters(result, 0, *p_unicode, 0,
1614 PyUnicode_GET_LENGTH(*p_unicode));
1615 Py_DECREF(*p_unicode);
1616 *p_unicode = result;
1617 return 0;
1618}
1619
1620static int
1621unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
1622 Py_UCS4 ch)
1623{
1624 if (unicode_widen(p_unicode, ch) < 0)
1625 return -1;
1626 PyUnicode_WRITE(PyUnicode_KIND(*p_unicode),
1627 PyUnicode_DATA(*p_unicode),
1628 (*pos)++, ch);
1629 return 0;
1630}
1631
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001632static PyObject*
1633get_latin1_char(unsigned char ch)
1634{
Victor Stinnera464fc12011-10-02 20:39:30 +02001635 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001636 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001637 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001638 if (!unicode)
1639 return NULL;
1640 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001641 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001642 unicode_latin1[ch] = unicode;
1643 }
1644 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001645 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001646}
1647
Alexander Belopolsky40018472011-02-26 01:02:56 +00001648PyObject *
1649PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001650{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001651 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001652 Py_UCS4 maxchar = 0;
1653 Py_ssize_t num_surrogates;
1654
1655 if (u == NULL)
1656 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001657
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001658 /* If the Unicode data is known at construction time, we can apply
1659 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001660
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001661 /* Optimization for empty strings */
1662 if (size == 0 && unicode_empty != NULL) {
1663 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001664 return unicode_empty;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001665 }
Tim Petersced69f82003-09-16 20:30:58 +00001666
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001667 /* Single character Unicode objects in the Latin-1 range are
1668 shared when using this constructor */
1669 if (size == 1 && *u < 256)
1670 return get_latin1_char((unsigned char)*u);
1671
1672 /* If not empty and not single character, copy the Unicode data
1673 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001674 if (find_maxchar_surrogates(u, u + size,
1675 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001676 return NULL;
1677
Victor Stinner8faf8212011-12-08 22:14:11 +01001678 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001679 if (!unicode)
1680 return NULL;
1681
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001682 switch (PyUnicode_KIND(unicode)) {
1683 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001684 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001685 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1686 break;
1687 case PyUnicode_2BYTE_KIND:
1688#if Py_UNICODE_SIZE == 2
1689 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1690#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001691 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001692 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1693#endif
1694 break;
1695 case PyUnicode_4BYTE_KIND:
1696#if SIZEOF_WCHAR_T == 2
1697 /* This is the only case which has to process surrogates, thus
1698 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001699 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001700#else
1701 assert(num_surrogates == 0);
1702 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1703#endif
1704 break;
1705 default:
1706 assert(0 && "Impossible state");
1707 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001708
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001709 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001710}
1711
Alexander Belopolsky40018472011-02-26 01:02:56 +00001712PyObject *
1713PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001714{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001715 if (size < 0) {
1716 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001717 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001718 return NULL;
1719 }
Christian Heimes33fe8092008-04-13 13:53:33 +00001720
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001721 /* If the Unicode data is known at construction time, we can apply
Martin v. Löwis9c121062007-08-05 20:26:11 +00001722 some optimizations which share commonly used objects.
1723 Also, this means the input must be UTF-8, so fall back to the
1724 UTF-8 decoder at the end. */
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001725 if (u != NULL) {
1726
Benjamin Peterson29060642009-01-31 22:14:21 +00001727 /* Optimization for empty strings */
1728 if (size == 0 && unicode_empty != NULL) {
1729 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001730 return unicode_empty;
Benjamin Peterson14339b62009-01-31 16:36:08 +00001731 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001732
1733 /* Single characters are shared when using this constructor.
1734 Restrict to ASCII, since the input must be UTF-8. */
Victor Stinner9faa3842011-10-23 20:06:00 +02001735 if (size == 1 && (unsigned char)*u < 128)
1736 return get_latin1_char((unsigned char)*u);
Martin v. Löwis9c121062007-08-05 20:26:11 +00001737
1738 return PyUnicode_DecodeUTF8(u, size, NULL);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001739 }
1740
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001741 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001742}
1743
Alexander Belopolsky40018472011-02-26 01:02:56 +00001744PyObject *
1745PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001746{
1747 size_t size = strlen(u);
1748 if (size > PY_SSIZE_T_MAX) {
1749 PyErr_SetString(PyExc_OverflowError, "input too long");
1750 return NULL;
1751 }
1752
1753 return PyUnicode_FromStringAndSize(u, size);
1754}
1755
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001756PyObject *
1757_PyUnicode_FromId(_Py_Identifier *id)
1758{
1759 if (!id->object) {
1760 id->object = PyUnicode_FromString(id->string);
1761 if (!id->object)
1762 return NULL;
1763 PyUnicode_InternInPlace(&id->object);
1764 assert(!id->next);
1765 id->next = static_strings;
1766 static_strings = id;
1767 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001768 return id->object;
1769}
1770
1771void
1772_PyUnicode_ClearStaticStrings()
1773{
1774 _Py_Identifier *i;
1775 for (i = static_strings; i; i = i->next) {
1776 Py_DECREF(i->object);
1777 i->object = NULL;
1778 i->next = NULL;
1779 }
1780}
1781
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001782/* Internal function, don't check maximum character */
1783
Victor Stinnere57b1c02011-09-28 22:20:48 +02001784static PyObject*
Victor Stinner0617b6e2011-10-05 23:26:01 +02001785unicode_fromascii(const unsigned char* s, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001786{
Victor Stinner785938e2011-12-11 20:09:03 +01001787 PyObject *unicode;
Victor Stinner0617b6e2011-10-05 23:26:01 +02001788#ifdef Py_DEBUG
1789 const unsigned char *p;
1790 const unsigned char *end = s + size;
1791 for (p=s; p < end; p++) {
1792 assert(*p < 128);
1793 }
1794#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001795 if (size == 1)
1796 return get_latin1_char(s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01001797 unicode = PyUnicode_New(size, 127);
1798 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001799 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001800 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1801 assert(_PyUnicode_CheckConsistency(unicode, 1));
1802 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001803}
1804
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001805static Py_UCS4
1806kind_maxchar_limit(unsigned int kind)
1807{
1808 switch(kind) {
1809 case PyUnicode_1BYTE_KIND:
1810 return 0x80;
1811 case PyUnicode_2BYTE_KIND:
1812 return 0x100;
1813 case PyUnicode_4BYTE_KIND:
1814 return 0x10000;
1815 default:
1816 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001817 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001818 }
1819}
1820
Victor Stinner702c7342011-10-05 13:50:52 +02001821static PyObject*
Victor Stinnere57b1c02011-09-28 22:20:48 +02001822_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001823{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001824 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001825 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001826
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001827 if (size == 0) {
1828 Py_INCREF(unicode_empty);
1829 return unicode_empty;
1830 }
1831 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001832 if (size == 1)
1833 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001834
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001835 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001836 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001837 if (!res)
1838 return NULL;
1839 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001840 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001841 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001842}
1843
Victor Stinnere57b1c02011-09-28 22:20:48 +02001844static PyObject*
1845_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001846{
1847 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001848 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001849
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001850 if (size == 0) {
1851 Py_INCREF(unicode_empty);
1852 return unicode_empty;
1853 }
1854 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001855 if (size == 1 && u[0] < 256)
Victor Stinner4e101002011-10-11 23:27:52 +02001856 return get_latin1_char((unsigned char)u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001857
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001858 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001859 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001860 if (!res)
1861 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001862 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001863 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001864 else {
1865 _PyUnicode_CONVERT_BYTES(
1866 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1867 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001868 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001869 return res;
1870}
1871
Victor Stinnere57b1c02011-09-28 22:20:48 +02001872static PyObject*
1873_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001874{
1875 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001876 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001877
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001878 if (size == 0) {
1879 Py_INCREF(unicode_empty);
1880 return unicode_empty;
1881 }
1882 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001883 if (size == 1 && u[0] < 256)
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001884 return get_latin1_char((unsigned char)u[0]);
1885
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001886 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001887 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001888 if (!res)
1889 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02001890 if (max_char < 256)
1891 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
1892 PyUnicode_1BYTE_DATA(res));
1893 else if (max_char < 0x10000)
1894 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
1895 PyUnicode_2BYTE_DATA(res));
1896 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001897 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001898 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001899 return res;
1900}
1901
1902PyObject*
1903PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1904{
Victor Stinnercfed46e2011-11-22 01:29:14 +01001905 if (size < 0) {
1906 PyErr_SetString(PyExc_ValueError, "size must be positive");
1907 return NULL;
1908 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001909 switch(kind) {
1910 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001911 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001912 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001913 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001914 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02001915 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001916 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02001917 PyErr_SetString(PyExc_SystemError, "invalid kind");
1918 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001919 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001920}
1921
Victor Stinner25a4b292011-10-06 12:31:55 +02001922/* Ensure that a string uses the most efficient storage, if it is not the
1923 case: create a new string with of the right kind. Write NULL into *p_unicode
1924 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02001925static void
Victor Stinner25a4b292011-10-06 12:31:55 +02001926unicode_adjust_maxchar(PyObject **p_unicode)
1927{
1928 PyObject *unicode, *copy;
1929 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001930 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02001931 unsigned int kind;
1932
1933 assert(p_unicode != NULL);
1934 unicode = *p_unicode;
1935 assert(PyUnicode_IS_READY(unicode));
1936 if (PyUnicode_IS_ASCII(unicode))
1937 return;
1938
1939 len = PyUnicode_GET_LENGTH(unicode);
1940 kind = PyUnicode_KIND(unicode);
1941 if (kind == PyUnicode_1BYTE_KIND) {
1942 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001943 max_char = ucs1lib_find_max_char(u, u + len);
1944 if (max_char >= 128)
1945 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001946 }
1947 else if (kind == PyUnicode_2BYTE_KIND) {
1948 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001949 max_char = ucs2lib_find_max_char(u, u + len);
1950 if (max_char >= 256)
1951 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001952 }
1953 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001954 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02001955 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001956 max_char = ucs4lib_find_max_char(u, u + len);
1957 if (max_char >= 0x10000)
1958 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02001959 }
Victor Stinner25a4b292011-10-06 12:31:55 +02001960 copy = PyUnicode_New(len, max_char);
1961 copy_characters(copy, 0, unicode, 0, len);
1962 Py_DECREF(unicode);
1963 *p_unicode = copy;
1964}
1965
Victor Stinner034f6cf2011-09-30 02:26:44 +02001966PyObject*
1967PyUnicode_Copy(PyObject *unicode)
1968{
Victor Stinner87af4f22011-11-21 23:03:47 +01001969 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001970 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001971
Victor Stinner034f6cf2011-09-30 02:26:44 +02001972 if (!PyUnicode_Check(unicode)) {
1973 PyErr_BadInternalCall();
1974 return NULL;
1975 }
1976 if (PyUnicode_READY(unicode))
1977 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001978
Victor Stinner87af4f22011-11-21 23:03:47 +01001979 length = PyUnicode_GET_LENGTH(unicode);
1980 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001981 if (!copy)
1982 return NULL;
1983 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1984
Victor Stinner87af4f22011-11-21 23:03:47 +01001985 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
1986 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001987 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02001988 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02001989}
1990
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001991
Victor Stinnerbc603d12011-10-02 01:00:40 +02001992/* Widen Unicode objects to larger buffers. Don't write terminating null
1993 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001994
1995void*
1996_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1997{
Victor Stinnerbc603d12011-10-02 01:00:40 +02001998 Py_ssize_t len;
1999 void *result;
2000 unsigned int skind;
2001
2002 if (PyUnicode_READY(s))
2003 return NULL;
2004
2005 len = PyUnicode_GET_LENGTH(s);
2006 skind = PyUnicode_KIND(s);
2007 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002008 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002009 return NULL;
2010 }
2011 switch(kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002012 case PyUnicode_2BYTE_KIND:
2013 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2014 if (!result)
2015 return PyErr_NoMemory();
2016 assert(skind == PyUnicode_1BYTE_KIND);
2017 _PyUnicode_CONVERT_BYTES(
2018 Py_UCS1, Py_UCS2,
2019 PyUnicode_1BYTE_DATA(s),
2020 PyUnicode_1BYTE_DATA(s) + len,
2021 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002022 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002023 case PyUnicode_4BYTE_KIND:
2024 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2025 if (!result)
2026 return PyErr_NoMemory();
2027 if (skind == PyUnicode_2BYTE_KIND) {
2028 _PyUnicode_CONVERT_BYTES(
2029 Py_UCS2, Py_UCS4,
2030 PyUnicode_2BYTE_DATA(s),
2031 PyUnicode_2BYTE_DATA(s) + len,
2032 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002033 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002034 else {
2035 assert(skind == PyUnicode_1BYTE_KIND);
2036 _PyUnicode_CONVERT_BYTES(
2037 Py_UCS1, Py_UCS4,
2038 PyUnicode_1BYTE_DATA(s),
2039 PyUnicode_1BYTE_DATA(s) + len,
2040 result);
2041 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002042 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002043 default:
2044 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002045 }
Victor Stinner01698042011-10-04 00:04:26 +02002046 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002047 return NULL;
2048}
2049
2050static Py_UCS4*
2051as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2052 int copy_null)
2053{
2054 int kind;
2055 void *data;
2056 Py_ssize_t len, targetlen;
2057 if (PyUnicode_READY(string) == -1)
2058 return NULL;
2059 kind = PyUnicode_KIND(string);
2060 data = PyUnicode_DATA(string);
2061 len = PyUnicode_GET_LENGTH(string);
2062 targetlen = len;
2063 if (copy_null)
2064 targetlen++;
2065 if (!target) {
2066 if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2067 PyErr_NoMemory();
2068 return NULL;
2069 }
2070 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2071 if (!target) {
2072 PyErr_NoMemory();
2073 return NULL;
2074 }
2075 }
2076 else {
2077 if (targetsize < targetlen) {
2078 PyErr_Format(PyExc_SystemError,
2079 "string is longer than the buffer");
2080 if (copy_null && 0 < targetsize)
2081 target[0] = 0;
2082 return NULL;
2083 }
2084 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002085 if (kind == PyUnicode_1BYTE_KIND) {
2086 Py_UCS1 *start = (Py_UCS1 *) data;
2087 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002088 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002089 else if (kind == PyUnicode_2BYTE_KIND) {
2090 Py_UCS2 *start = (Py_UCS2 *) data;
2091 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2092 }
2093 else {
2094 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002095 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002096 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002097 if (copy_null)
2098 target[len] = 0;
2099 return target;
2100}
2101
2102Py_UCS4*
2103PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2104 int copy_null)
2105{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002106 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002107 PyErr_BadInternalCall();
2108 return NULL;
2109 }
2110 return as_ucs4(string, target, targetsize, copy_null);
2111}
2112
2113Py_UCS4*
2114PyUnicode_AsUCS4Copy(PyObject *string)
2115{
2116 return as_ucs4(string, NULL, 0, 1);
2117}
2118
2119#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002120
Alexander Belopolsky40018472011-02-26 01:02:56 +00002121PyObject *
2122PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002123{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002124 if (w == NULL) {
Victor Stinner382955f2011-12-11 21:44:00 +01002125 if (size == 0) {
2126 Py_INCREF(unicode_empty);
2127 return unicode_empty;
2128 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002129 PyErr_BadInternalCall();
2130 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002131 }
2132
Martin v. Löwis790465f2008-04-05 20:41:37 +00002133 if (size == -1) {
2134 size = wcslen(w);
2135 }
2136
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002137 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002138}
2139
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002140#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002141
Walter Dörwald346737f2007-05-31 10:44:43 +00002142static void
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002143makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2144 int zeropad, int width, int precision, char c)
Walter Dörwald346737f2007-05-31 10:44:43 +00002145{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002146 *fmt++ = '%';
2147 if (width) {
2148 if (zeropad)
2149 *fmt++ = '0';
2150 fmt += sprintf(fmt, "%d", width);
2151 }
2152 if (precision)
2153 fmt += sprintf(fmt, ".%d", precision);
2154 if (longflag)
2155 *fmt++ = 'l';
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002156 else if (longlongflag) {
2157 /* longlongflag should only ever be nonzero on machines with
2158 HAVE_LONG_LONG defined */
2159#ifdef HAVE_LONG_LONG
2160 char *f = PY_FORMAT_LONG_LONG;
2161 while (*f)
2162 *fmt++ = *f++;
2163#else
2164 /* we shouldn't ever get here */
2165 assert(0);
2166 *fmt++ = 'l';
2167#endif
2168 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002169 else if (size_tflag) {
2170 char *f = PY_FORMAT_SIZE_T;
2171 while (*f)
2172 *fmt++ = *f++;
2173 }
2174 *fmt++ = c;
2175 *fmt = '\0';
Walter Dörwald346737f2007-05-31 10:44:43 +00002176}
2177
Victor Stinner96865452011-03-01 23:44:09 +00002178/* helper for PyUnicode_FromFormatV() */
2179
2180static const char*
2181parse_format_flags(const char *f,
2182 int *p_width, int *p_precision,
2183 int *p_longflag, int *p_longlongflag, int *p_size_tflag)
2184{
2185 int width, precision, longflag, longlongflag, size_tflag;
2186
2187 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2188 f++;
2189 width = 0;
2190 while (Py_ISDIGIT((unsigned)*f))
2191 width = (width*10) + *f++ - '0';
2192 precision = 0;
2193 if (*f == '.') {
2194 f++;
2195 while (Py_ISDIGIT((unsigned)*f))
2196 precision = (precision*10) + *f++ - '0';
2197 if (*f == '%') {
2198 /* "%.3%s" => f points to "3" */
2199 f--;
2200 }
2201 }
2202 if (*f == '\0') {
2203 /* bogus format "%.1" => go backward, f points to "1" */
2204 f--;
2205 }
2206 if (p_width != NULL)
2207 *p_width = width;
2208 if (p_precision != NULL)
2209 *p_precision = precision;
2210
2211 /* Handle %ld, %lu, %lld and %llu. */
2212 longflag = 0;
2213 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002214 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002215
2216 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002217 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002218 longflag = 1;
2219 ++f;
2220 }
2221#ifdef HAVE_LONG_LONG
2222 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002223 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002224 longlongflag = 1;
2225 f += 2;
2226 }
2227#endif
2228 }
2229 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002230 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002231 size_tflag = 1;
2232 ++f;
2233 }
2234 if (p_longflag != NULL)
2235 *p_longflag = longflag;
2236 if (p_longlongflag != NULL)
2237 *p_longlongflag = longlongflag;
2238 if (p_size_tflag != NULL)
2239 *p_size_tflag = size_tflag;
2240 return f;
2241}
2242
/* Minimum slot sizes used by PyUnicode_FromFormatV() when it reserves room
   for each formatted number.

   MAX_LONG_CHARS: maximum number of characters required for output of %ld.
   21 characters allows for 64-bit integers (in decimal) and an optional
   sign. */
#define MAX_LONG_CHARS 21
/* MAX_LONG_LONG_CHARS: maximum number of characters required for output of
   %lld.  We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2250
Walter Dörwaldd2034312007-05-18 16:29:38 +00002251PyObject *
2252PyUnicode_FromFormatV(const char *format, va_list vargs)
2253{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002254 va_list count;
2255 Py_ssize_t callcount = 0;
2256 PyObject **callresults = NULL;
2257 PyObject **callresult = NULL;
2258 Py_ssize_t n = 0;
2259 int width = 0;
2260 int precision = 0;
2261 int zeropad;
2262 const char* f;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002263 PyObject *string;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002264 /* used by sprintf */
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002265 char fmt[61]; /* should be enough for %0width.precisionlld */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002266 Py_UCS4 maxchar = 127; /* result is ASCII by default */
2267 Py_UCS4 argmaxchar;
2268 Py_ssize_t numbersize = 0;
2269 char *numberresults = NULL;
2270 char *numberresult = NULL;
2271 Py_ssize_t i;
2272 int kind;
2273 void *data;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002274
Victor Stinner4a2b7a12010-08-13 14:03:48 +00002275 Py_VA_COPY(count, vargs);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002276 /* step 1: count the number of %S/%R/%A/%s format specifications
2277 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
2278 * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002279 * result in an array)
Georg Brandl7597add2011-10-05 16:36:47 +02002280 * also estimate a upper bound for all the number formats in the string,
2281 * numbers will be formatted in step 3 and be kept in a '\0'-separated
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002282 * buffer before putting everything together. */
Benjamin Peterson14339b62009-01-31 16:36:08 +00002283 for (f = format; *f; f++) {
2284 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002285 int longlongflag;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002286 /* skip width or width.precision (eg. "1.2" of "%1.2f") */
2287 f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
2288 if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
2289 ++callcount;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002290
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002291 else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
Mark Dickinson6ce4a9a2009-11-16 17:00:11 +00002292#ifdef HAVE_LONG_LONG
2293 if (longlongflag) {
2294 if (width < MAX_LONG_LONG_CHARS)
2295 width = MAX_LONG_LONG_CHARS;
2296 }
2297 else
2298#endif
2299 /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
2300 including sign. Decimal takes the most space. This
2301 isn't enough for octal. If a width is specified we
2302 need more (which we allocate later). */
2303 if (width < MAX_LONG_CHARS)
2304 width = MAX_LONG_CHARS;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002305
2306 /* account for the size + '\0' to separate numbers
2307 inside of the numberresults buffer */
2308 numbersize += (width + 1);
2309 }
2310 }
2311 else if ((unsigned char)*f > 127) {
2312 PyErr_Format(PyExc_ValueError,
2313 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2314 "string, got a non-ASCII byte: 0x%02x",
2315 (unsigned char)*f);
2316 return NULL;
2317 }
2318 }
2319 /* step 2: allocate memory for the results of
2320 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
2321 if (callcount) {
2322 callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
2323 if (!callresults) {
2324 PyErr_NoMemory();
2325 return NULL;
2326 }
2327 callresult = callresults;
2328 }
2329 /* step 2.5: allocate memory for the results of formating numbers */
2330 if (numbersize) {
2331 numberresults = PyObject_Malloc(numbersize);
2332 if (!numberresults) {
2333 PyErr_NoMemory();
2334 goto fail;
2335 }
2336 numberresult = numberresults;
2337 }
2338
2339 /* step 3: format numbers and figure out how large a buffer we need */
2340 for (f = format; *f; f++) {
2341 if (*f == '%') {
2342 const char* p;
2343 int longflag;
2344 int longlongflag;
2345 int size_tflag;
2346 int numprinted;
2347
2348 p = f;
2349 zeropad = (f[1] == '0');
2350 f = parse_format_flags(f, &width, &precision,
2351 &longflag, &longlongflag, &size_tflag);
2352 switch (*f) {
2353 case 'c':
2354 {
2355 Py_UCS4 ordinal = va_arg(count, int);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002356 maxchar = Py_MAX(maxchar, ordinal);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002357 n++;
2358 break;
2359 }
2360 case '%':
2361 n++;
2362 break;
2363 case 'i':
2364 case 'd':
2365 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2366 width, precision, *f);
2367 if (longflag)
2368 numprinted = sprintf(numberresult, fmt,
2369 va_arg(count, long));
2370#ifdef HAVE_LONG_LONG
2371 else if (longlongflag)
2372 numprinted = sprintf(numberresult, fmt,
2373 va_arg(count, PY_LONG_LONG));
2374#endif
2375 else if (size_tflag)
2376 numprinted = sprintf(numberresult, fmt,
2377 va_arg(count, Py_ssize_t));
2378 else
2379 numprinted = sprintf(numberresult, fmt,
2380 va_arg(count, int));
2381 n += numprinted;
2382 /* advance by +1 to skip over the '\0' */
2383 numberresult += (numprinted + 1);
2384 assert(*(numberresult - 1) == '\0');
2385 assert(*(numberresult - 2) != '\0');
2386 assert(numprinted >= 0);
2387 assert(numberresult <= numberresults + numbersize);
2388 break;
2389 case 'u':
2390 makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2391 width, precision, 'u');
2392 if (longflag)
2393 numprinted = sprintf(numberresult, fmt,
2394 va_arg(count, unsigned long));
2395#ifdef HAVE_LONG_LONG
2396 else if (longlongflag)
2397 numprinted = sprintf(numberresult, fmt,
2398 va_arg(count, unsigned PY_LONG_LONG));
2399#endif
2400 else if (size_tflag)
2401 numprinted = sprintf(numberresult, fmt,
2402 va_arg(count, size_t));
2403 else
2404 numprinted = sprintf(numberresult, fmt,
2405 va_arg(count, unsigned int));
2406 n += numprinted;
2407 numberresult += (numprinted + 1);
2408 assert(*(numberresult - 1) == '\0');
2409 assert(*(numberresult - 2) != '\0');
2410 assert(numprinted >= 0);
2411 assert(numberresult <= numberresults + numbersize);
2412 break;
2413 case 'x':
2414 makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2415 numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2416 n += numprinted;
2417 numberresult += (numprinted + 1);
2418 assert(*(numberresult - 1) == '\0');
2419 assert(*(numberresult - 2) != '\0');
2420 assert(numprinted >= 0);
2421 assert(numberresult <= numberresults + numbersize);
2422 break;
2423 case 'p':
2424 numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2425 /* %p is ill-defined: ensure leading 0x. */
2426 if (numberresult[1] == 'X')
2427 numberresult[1] = 'x';
2428 else if (numberresult[1] != 'x') {
2429 memmove(numberresult + 2, numberresult,
2430 strlen(numberresult) + 1);
2431 numberresult[0] = '0';
2432 numberresult[1] = 'x';
2433 numprinted += 2;
2434 }
2435 n += numprinted;
2436 numberresult += (numprinted + 1);
2437 assert(*(numberresult - 1) == '\0');
2438 assert(*(numberresult - 2) != '\0');
2439 assert(numprinted >= 0);
2440 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002441 break;
2442 case 's':
2443 {
2444 /* UTF-8 */
Georg Brandl780b2a62009-05-05 09:19:59 +00002445 const char *s = va_arg(count, const char*);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002446 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2447 if (!str)
2448 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002449 /* since PyUnicode_DecodeUTF8 returns already flexible
2450 unicode objects, there is no need to call ready on them */
2451 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002452 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002453 n += PyUnicode_GET_LENGTH(str);
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002454 /* Remember the str and switch to the next slot */
2455 *callresult++ = str;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002456 break;
2457 }
2458 case 'U':
2459 {
2460 PyObject *obj = va_arg(count, PyObject *);
Victor Stinner910337b2011-10-03 03:20:16 +02002461 assert(obj && _PyUnicode_CHECK(obj));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002462 if (PyUnicode_READY(obj) == -1)
2463 goto fail;
2464 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002465 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002466 n += PyUnicode_GET_LENGTH(obj);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002467 break;
2468 }
2469 case 'V':
2470 {
2471 PyObject *obj = va_arg(count, PyObject *);
2472 const char *str = va_arg(count, const char *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002473 PyObject *str_obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002474 assert(obj || str);
Victor Stinner910337b2011-10-03 03:20:16 +02002475 assert(!obj || _PyUnicode_CHECK(obj));
Victor Stinner2512a8b2011-03-01 22:46:52 +00002476 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002477 if (PyUnicode_READY(obj) == -1)
2478 goto fail;
2479 argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002480 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002481 n += PyUnicode_GET_LENGTH(obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002482 *callresult++ = NULL;
2483 }
2484 else {
2485 str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2486 if (!str_obj)
2487 goto fail;
Victor Stinnere1335c72011-10-04 20:53:03 +02002488 if (PyUnicode_READY(str_obj)) {
2489 Py_DECREF(str_obj);
2490 goto fail;
2491 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002492 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002493 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002494 n += PyUnicode_GET_LENGTH(str_obj);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002495 *callresult++ = str_obj;
2496 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002497 break;
2498 }
2499 case 'S':
2500 {
2501 PyObject *obj = va_arg(count, PyObject *);
2502 PyObject *str;
2503 assert(obj);
2504 str = PyObject_Str(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002505 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002506 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002507 argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002508 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002509 n += PyUnicode_GET_LENGTH(str);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002510 /* Remember the str and switch to the next slot */
2511 *callresult++ = str;
2512 break;
2513 }
2514 case 'R':
2515 {
2516 PyObject *obj = va_arg(count, PyObject *);
2517 PyObject *repr;
2518 assert(obj);
2519 repr = PyObject_Repr(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002520 if (!repr || PyUnicode_READY(repr) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002521 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002522 argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002523 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002524 n += PyUnicode_GET_LENGTH(repr);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002525 /* Remember the repr and switch to the next slot */
2526 *callresult++ = repr;
2527 break;
2528 }
2529 case 'A':
2530 {
2531 PyObject *obj = va_arg(count, PyObject *);
2532 PyObject *ascii;
2533 assert(obj);
2534 ascii = PyObject_ASCII(obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002535 if (!ascii || PyUnicode_READY(ascii) == -1)
Benjamin Peterson14339b62009-01-31 16:36:08 +00002536 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002537 argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
Georg Brandl4cb0de22011-09-28 21:49:49 +02002538 maxchar = Py_MAX(maxchar, argmaxchar);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002539 n += PyUnicode_GET_LENGTH(ascii);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002540 /* Remember the repr and switch to the next slot */
2541 *callresult++ = ascii;
2542 break;
2543 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002544 default:
2545 /* if we stumble upon an unknown
2546 formatting code, copy the rest of
2547 the format string to the output
2548 string. (we cannot just skip the
2549 code, since there's no way to know
2550 what's in the argument list) */
2551 n += strlen(p);
2552 goto expand;
2553 }
2554 } else
2555 n++;
2556 }
Benjamin Peterson29060642009-01-31 22:14:21 +00002557 expand:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002558 /* step 4: fill the buffer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002559 /* Since we've analyzed how much space we need,
Benjamin Peterson14339b62009-01-31 16:36:08 +00002560 we don't have to resize the string.
2561 There can be no errors beyond this point. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002562 string = PyUnicode_New(n, maxchar);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002563 if (!string)
2564 goto fail;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002565 kind = PyUnicode_KIND(string);
2566 data = PyUnicode_DATA(string);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002567 callresult = callresults;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002568 numberresult = numberresults;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002569
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002570 for (i = 0, f = format; *f; f++) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002571 if (*f == '%') {
Victor Stinner96865452011-03-01 23:44:09 +00002572 const char* p;
Victor Stinner96865452011-03-01 23:44:09 +00002573
2574 p = f;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002575 f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2576 /* checking for == because the last argument could be a empty
2577 string, which causes i to point to end, the assert at the end of
2578 the loop */
2579 assert(i <= PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002580
Benjamin Peterson14339b62009-01-31 16:36:08 +00002581 switch (*f) {
2582 case 'c':
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002583 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002584 const int ordinal = va_arg(vargs, int);
2585 PyUnicode_WRITE(kind, data, i++, ordinal);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002586 break;
Victor Stinner5ed8b2c2011-02-21 21:13:44 +00002587 }
Victor Stinner6d970f42011-03-02 00:04:25 +00002588 case 'i':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002589 case 'd':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002590 case 'u':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002591 case 'x':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002592 case 'p':
2593 /* unused, since we already have the result */
2594 if (*f == 'p')
2595 (void) va_arg(vargs, void *);
2596 else
2597 (void) va_arg(vargs, int);
2598 /* extract the result from numberresults and append. */
2599 for (; *numberresult; ++i, ++numberresult)
2600 PyUnicode_WRITE(kind, data, i, *numberresult);
2601 /* skip over the separating '\0' */
2602 assert(*numberresult == '\0');
2603 numberresult++;
2604 assert(numberresult <= numberresults + numbersize);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002605 break;
2606 case 's':
2607 {
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002608 /* unused, since we already have the result */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002609 Py_ssize_t size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002610 (void) va_arg(vargs, char *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002611 size = PyUnicode_GET_LENGTH(*callresult);
2612 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002613 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002614 i += size;
Walter Dörwaldc1651a02009-05-03 22:55:55 +00002615 /* We're done with the unicode()/repr() => forget it */
2616 Py_DECREF(*callresult);
2617 /* switch to next unicode()/repr() result */
2618 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002619 break;
2620 }
2621 case 'U':
2622 {
2623 PyObject *obj = va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002624 Py_ssize_t size;
2625 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2626 size = PyUnicode_GET_LENGTH(obj);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002627 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002628 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002629 break;
2630 }
2631 case 'V':
2632 {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002633 Py_ssize_t size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002634 PyObject *obj = va_arg(vargs, PyObject *);
Victor Stinner2512a8b2011-03-01 22:46:52 +00002635 va_arg(vargs, const char *);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002636 if (obj) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002637 size = PyUnicode_GET_LENGTH(obj);
2638 assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002639 copy_characters(string, i, obj, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002640 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002641 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002642 size = PyUnicode_GET_LENGTH(*callresult);
2643 assert(PyUnicode_KIND(*callresult) <=
2644 PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002645 copy_characters(string, i, *callresult, 0, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002646 i += size;
Victor Stinner2512a8b2011-03-01 22:46:52 +00002647 Py_DECREF(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002648 }
Victor Stinner2512a8b2011-03-01 22:46:52 +00002649 ++callresult;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002650 break;
2651 }
2652 case 'S':
2653 case 'R':
Victor Stinner9a909002010-10-18 20:59:24 +00002654 case 'A':
Benjamin Peterson14339b62009-01-31 16:36:08 +00002655 {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002656 Py_ssize_t size = PyUnicode_GET_LENGTH(*callresult);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002657 /* unused, since we already have the result */
2658 (void) va_arg(vargs, PyObject *);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002659 assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002660 copy_characters(string, i, *callresult, 0, size);
2661 i += size;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002662 /* We're done with the unicode()/repr() => forget it */
2663 Py_DECREF(*callresult);
2664 /* switch to next unicode()/repr() result */
2665 ++callresult;
2666 break;
2667 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002668 case '%':
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002669 PyUnicode_WRITE(kind, data, i++, '%');
Benjamin Peterson14339b62009-01-31 16:36:08 +00002670 break;
2671 default:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002672 for (; *p; ++p, ++i)
2673 PyUnicode_WRITE(kind, data, i, *p);
2674 assert(i == PyUnicode_GET_LENGTH(string));
Benjamin Peterson14339b62009-01-31 16:36:08 +00002675 goto end;
2676 }
Victor Stinner1205f272010-09-11 00:54:47 +00002677 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002678 else {
2679 assert(i < PyUnicode_GET_LENGTH(string));
2680 PyUnicode_WRITE(kind, data, i++, *f);
2681 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002682 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002683 assert(i == PyUnicode_GET_LENGTH(string));
Walter Dörwaldd2034312007-05-18 16:29:38 +00002684
Benjamin Peterson29060642009-01-31 22:14:21 +00002685 end:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002686 if (callresults)
2687 PyObject_Free(callresults);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002688 if (numberresults)
2689 PyObject_Free(numberresults);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002690 return unicode_result(string);
Benjamin Peterson29060642009-01-31 22:14:21 +00002691 fail:
Benjamin Peterson14339b62009-01-31 16:36:08 +00002692 if (callresults) {
2693 PyObject **callresult2 = callresults;
2694 while (callresult2 < callresult) {
Victor Stinner2512a8b2011-03-01 22:46:52 +00002695 Py_XDECREF(*callresult2);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002696 ++callresult2;
2697 }
2698 PyObject_Free(callresults);
2699 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002700 if (numberresults)
2701 PyObject_Free(numberresults);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002702 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002703}
2704
Walter Dörwaldd2034312007-05-18 16:29:38 +00002705PyObject *
2706PyUnicode_FromFormat(const char *format, ...)
2707{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002708 PyObject* ret;
2709 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002710
2711#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002712 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002713#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002714 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002715#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002716 ret = PyUnicode_FromFormatV(format, vargs);
2717 va_end(vargs);
2718 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002719}
2720
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002721#ifdef HAVE_WCHAR_H
2722
Victor Stinner5593d8a2010-10-02 11:11:27 +00002723/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2724 convert a Unicode object to a wide character string.
2725
Victor Stinnerd88d9832011-09-06 02:00:05 +02002726 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002727 character) required to convert the unicode object. Ignore size argument.
2728
Victor Stinnerd88d9832011-09-06 02:00:05 +02002729 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002730 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002731 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002732static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002733unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002734 wchar_t *w,
2735 Py_ssize_t size)
2736{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002737 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002738 const wchar_t *wstr;
2739
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002740 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002741 if (wstr == NULL)
2742 return -1;
2743
Victor Stinner5593d8a2010-10-02 11:11:27 +00002744 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002745 if (size > res)
2746 size = res + 1;
2747 else
2748 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002749 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002750 return res;
2751 }
2752 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002753 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002754}
2755
2756Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002757PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002758 wchar_t *w,
2759 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002760{
2761 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002762 PyErr_BadInternalCall();
2763 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002764 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002765 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002766}
2767
Victor Stinner137c34c2010-09-29 10:25:54 +00002768wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002769PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002770 Py_ssize_t *size)
2771{
2772 wchar_t* buffer;
2773 Py_ssize_t buflen;
2774
2775 if (unicode == NULL) {
2776 PyErr_BadInternalCall();
2777 return NULL;
2778 }
2779
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002780 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002781 if (buflen == -1)
2782 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002783 if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002784 PyErr_NoMemory();
2785 return NULL;
2786 }
2787
Victor Stinner137c34c2010-09-29 10:25:54 +00002788 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2789 if (buffer == NULL) {
2790 PyErr_NoMemory();
2791 return NULL;
2792 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002793 buflen = unicode_aswidechar(unicode, buffer, buflen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002794 if (buflen == -1)
2795 return NULL;
Victor Stinner5593d8a2010-10-02 11:11:27 +00002796 if (size != NULL)
2797 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002798 return buffer;
2799}
2800
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002801#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002802
Alexander Belopolsky40018472011-02-26 01:02:56 +00002803PyObject *
2804PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002805{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002806 PyObject *v;
Victor Stinner8faf8212011-12-08 22:14:11 +01002807 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002808 PyErr_SetString(PyExc_ValueError,
2809 "chr() arg not in range(0x110000)");
2810 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002811 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002812
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002813 if (ordinal < 256)
2814 return get_latin1_char(ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002815
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002816 v = PyUnicode_New(1, ordinal);
2817 if (v == NULL)
2818 return NULL;
2819 PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002820 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002821 return v;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002822}
2823
Alexander Belopolsky40018472011-02-26 01:02:56 +00002824PyObject *
2825PyUnicode_FromObject(register PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002826{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002827 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002828 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002829 if (PyUnicode_CheckExact(obj)) {
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002830 if (PyUnicode_READY(obj))
2831 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002832 Py_INCREF(obj);
2833 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002834 }
2835 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002836 /* For a Unicode subtype that's not a Unicode object,
2837 return a true Unicode object with the same data. */
Victor Stinner2219e0a2011-10-01 01:16:59 +02002838 return PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002839 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002840 PyErr_Format(PyExc_TypeError,
2841 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002842 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002843 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002844}
2845
Alexander Belopolsky40018472011-02-26 01:02:56 +00002846PyObject *
2847PyUnicode_FromEncodedObject(register PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002848 const char *encoding,
2849 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002850{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002851 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002852 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002853
Guido van Rossumd57fd912000-03-10 22:53:23 +00002854 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002855 PyErr_BadInternalCall();
2856 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002857 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002858
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002859 /* Decoding bytes objects is the most common case and should be fast */
2860 if (PyBytes_Check(obj)) {
2861 if (PyBytes_GET_SIZE(obj) == 0) {
2862 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002863 v = unicode_empty;
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002864 }
2865 else {
2866 v = PyUnicode_Decode(
2867 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2868 encoding, errors);
2869 }
2870 return v;
2871 }
2872
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002873 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002874 PyErr_SetString(PyExc_TypeError,
2875 "decoding str is not supported");
2876 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002877 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002878
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002879 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2880 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2881 PyErr_Format(PyExc_TypeError,
2882 "coercing to str: need bytes, bytearray "
2883 "or buffer-like object, %.80s found",
2884 Py_TYPE(obj)->tp_name);
2885 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002886 }
Tim Petersced69f82003-09-16 20:30:58 +00002887
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002888 if (buffer.len == 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002889 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02002890 v = unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002891 }
Tim Petersced69f82003-09-16 20:30:58 +00002892 else
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002893 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002894
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002895 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002896 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002897}
2898
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding is normalized to "utf-8".

   encoding:  NUL-terminated encoding name, or NULL.
   lower:     output buffer receiving the normalized, NUL-terminated name.
   lower_len: size of the output buffer in bytes, terminator included.

   Return 0 on error (the normalized name, including its terminator, does
   not fit into lower_len bytes), 1 on success.  On error the contents of
   lower are unspecified and must not be used. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    static const char default_name[] = "utf-8";
    const char *e;
    char *l;
    char *l_end;

    /* A zero-sized buffer cannot even hold the terminator, and the
       "lower_len - 1" computation below would wrap around. */
    if (lower_len == 0)
        return 0;

    if (encoding == NULL) {
        /* sizeof includes the trailing NUL; refuse to overflow the
           caller's buffer instead of copying blindly. */
        if (lower_len < sizeof(default_name))
            return 0;
        strcpy(lower, default_name);
        return 1;
    }

    e = encoding;
    l = lower;
    /* Last usable slot: reserved for the terminating NUL. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2935
Alexander Belopolsky40018472011-02-26 01:02:56 +00002936PyObject *
2937PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002938 Py_ssize_t size,
2939 const char *encoding,
2940 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002941{
2942 PyObject *buffer = NULL, *unicode;
2943 Py_buffer info;
2944 char lower[11]; /* Enough for any encoding shortcut */
2945
Fred Drakee4315f52000-05-09 19:53:39 +00002946 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00002947 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002948 if ((strcmp(lower, "utf-8") == 0) ||
2949 (strcmp(lower, "utf8") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002950 return PyUnicode_DecodeUTF8(s, size, errors);
2951 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002952 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00002953 (strcmp(lower, "iso-8859-1") == 0))
2954 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002955#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002956 else if (strcmp(lower, "mbcs") == 0)
2957 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002958#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002959 else if (strcmp(lower, "ascii") == 0)
2960 return PyUnicode_DecodeASCII(s, size, errors);
2961 else if (strcmp(lower, "utf-16") == 0)
2962 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2963 else if (strcmp(lower, "utf-32") == 0)
2964 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2965 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002966
2967 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002968 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002969 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002970 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002971 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002972 if (buffer == NULL)
2973 goto onError;
2974 unicode = PyCodec_Decode(buffer, encoding, errors);
2975 if (unicode == NULL)
2976 goto onError;
2977 if (!PyUnicode_Check(unicode)) {
2978 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00002979 "decoder did not return a str object (type=%.400s)",
Christian Heimes90aa7642007-12-19 02:45:37 +00002980 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002981 Py_DECREF(unicode);
2982 goto onError;
2983 }
2984 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002985 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00002986
Benjamin Peterson29060642009-01-31 22:14:21 +00002987 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00002988 Py_XDECREF(buffer);
2989 return NULL;
2990}
2991
Alexander Belopolsky40018472011-02-26 01:02:56 +00002992PyObject *
2993PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002994 const char *encoding,
2995 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00002996{
2997 PyObject *v;
2998
2999 if (!PyUnicode_Check(unicode)) {
3000 PyErr_BadArgument();
3001 goto onError;
3002 }
3003
3004 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003005 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003006
3007 /* Decode via the codec registry */
3008 v = PyCodec_Decode(unicode, encoding, errors);
3009 if (v == NULL)
3010 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003011 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003012
Benjamin Peterson29060642009-01-31 22:14:21 +00003013 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003014 return NULL;
3015}
3016
Alexander Belopolsky40018472011-02-26 01:02:56 +00003017PyObject *
3018PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003019 const char *encoding,
3020 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003021{
3022 PyObject *v;
3023
3024 if (!PyUnicode_Check(unicode)) {
3025 PyErr_BadArgument();
3026 goto onError;
3027 }
3028
3029 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003030 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003031
3032 /* Decode via the codec registry */
3033 v = PyCodec_Decode(unicode, encoding, errors);
3034 if (v == NULL)
3035 goto onError;
3036 if (!PyUnicode_Check(v)) {
3037 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003038 "decoder did not return a str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003039 Py_TYPE(v)->tp_name);
3040 Py_DECREF(v);
3041 goto onError;
3042 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003043 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003044
Benjamin Peterson29060642009-01-31 22:14:21 +00003045 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003046 return NULL;
3047}
3048
Alexander Belopolsky40018472011-02-26 01:02:56 +00003049PyObject *
3050PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003051 Py_ssize_t size,
3052 const char *encoding,
3053 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003054{
3055 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003056
Guido van Rossumd57fd912000-03-10 22:53:23 +00003057 unicode = PyUnicode_FromUnicode(s, size);
3058 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003059 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003060 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3061 Py_DECREF(unicode);
3062 return v;
3063}
3064
Alexander Belopolsky40018472011-02-26 01:02:56 +00003065PyObject *
3066PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003067 const char *encoding,
3068 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003069{
3070 PyObject *v;
3071
3072 if (!PyUnicode_Check(unicode)) {
3073 PyErr_BadArgument();
3074 goto onError;
3075 }
3076
3077 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003078 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003079
3080 /* Encode via the codec registry */
3081 v = PyCodec_Encode(unicode, encoding, errors);
3082 if (v == NULL)
3083 goto onError;
3084 return v;
3085
Benjamin Peterson29060642009-01-31 22:14:21 +00003086 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003087 return NULL;
3088}
3089
Victor Stinnerad158722010-10-27 00:25:46 +00003090PyObject *
3091PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003092{
Victor Stinner99b95382011-07-04 14:23:54 +02003093#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003094 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003095#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003096 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003097#else
Victor Stinner793b5312011-04-27 00:24:21 +02003098 PyInterpreterState *interp = PyThreadState_GET()->interp;
3099 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3100 cannot use it to encode and decode filenames before it is loaded. Load
3101 the Python codec requires to encode at least its own filename. Use the C
3102 version of the locale codec until the codec registry is initialized and
3103 the Python codec is loaded.
3104
3105 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3106 cannot only rely on it: check also interp->fscodec_initialized for
3107 subinterpreters. */
3108 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003109 return PyUnicode_AsEncodedString(unicode,
3110 Py_FileSystemDefaultEncoding,
3111 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003112 }
3113 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003114 /* locale encoding with surrogateescape */
3115 wchar_t *wchar;
3116 char *bytes;
3117 PyObject *bytes_obj;
Victor Stinner2f02a512010-11-08 22:43:46 +00003118 size_t error_pos;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003119
3120 wchar = PyUnicode_AsWideCharString(unicode, NULL);
3121 if (wchar == NULL)
3122 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003123 bytes = _Py_wchar2char(wchar, &error_pos);
3124 if (bytes == NULL) {
3125 if (error_pos != (size_t)-1) {
3126 char *errmsg = strerror(errno);
3127 PyObject *exc = NULL;
3128 if (errmsg == NULL)
3129 errmsg = "Py_wchar2char() failed";
3130 raise_encode_exception(&exc,
Martin v. Löwis12be46c2011-11-04 19:04:15 +01003131 "filesystemencoding", unicode,
Victor Stinner2f02a512010-11-08 22:43:46 +00003132 error_pos, error_pos+1,
3133 errmsg);
3134 Py_XDECREF(exc);
3135 }
3136 else
3137 PyErr_NoMemory();
3138 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003139 return NULL;
Victor Stinner2f02a512010-11-08 22:43:46 +00003140 }
3141 PyMem_Free(wchar);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003142
3143 bytes_obj = PyBytes_FromString(bytes);
3144 PyMem_Free(bytes);
3145 return bytes_obj;
Victor Stinnerc39211f2010-09-29 16:35:47 +00003146 }
Victor Stinnerad158722010-10-27 00:25:46 +00003147#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003148}
3149
Alexander Belopolsky40018472011-02-26 01:02:56 +00003150PyObject *
3151PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003152 const char *encoding,
3153 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003154{
3155 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003156 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003157
Guido van Rossumd57fd912000-03-10 22:53:23 +00003158 if (!PyUnicode_Check(unicode)) {
3159 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003160 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003161 }
Fred Drakee4315f52000-05-09 19:53:39 +00003162
Fred Drakee4315f52000-05-09 19:53:39 +00003163 /* Shortcuts for common default encodings */
Victor Stinner37296e82010-06-10 13:36:23 +00003164 if (normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003165 if ((strcmp(lower, "utf-8") == 0) ||
3166 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003167 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003168 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003169 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003170 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003171 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003172 }
Victor Stinner37296e82010-06-10 13:36:23 +00003173 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003174 (strcmp(lower, "latin1") == 0) ||
Victor Stinner37296e82010-06-10 13:36:23 +00003175 (strcmp(lower, "iso-8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003176 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003177#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003178 else if (strcmp(lower, "mbcs") == 0)
3179 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003180#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003181 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003182 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003183 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003184
3185 /* Encode via the codec registry */
3186 v = PyCodec_Encode(unicode, encoding, errors);
3187 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003188 return NULL;
3189
3190 /* The normal path */
3191 if (PyBytes_Check(v))
3192 return v;
3193
3194 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003195 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003196 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003197 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003198
3199 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3200 "encoder %s returned bytearray instead of bytes",
3201 encoding);
3202 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003203 Py_DECREF(v);
3204 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003205 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003206
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003207 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3208 Py_DECREF(v);
3209 return b;
3210 }
3211
3212 PyErr_Format(PyExc_TypeError,
3213 "encoder did not return a bytes object (type=%.400s)",
3214 Py_TYPE(v)->tp_name);
3215 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003216 return NULL;
3217}
3218
Alexander Belopolsky40018472011-02-26 01:02:56 +00003219PyObject *
3220PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003221 const char *encoding,
3222 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003223{
3224 PyObject *v;
3225
3226 if (!PyUnicode_Check(unicode)) {
3227 PyErr_BadArgument();
3228 goto onError;
3229 }
3230
3231 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003232 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003233
3234 /* Encode via the codec registry */
3235 v = PyCodec_Encode(unicode, encoding, errors);
3236 if (v == NULL)
3237 goto onError;
3238 if (!PyUnicode_Check(v)) {
3239 PyErr_Format(PyExc_TypeError,
Benjamin Peterson142957c2008-07-04 19:55:29 +00003240 "encoder did not return an str object (type=%.400s)",
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003241 Py_TYPE(v)->tp_name);
3242 Py_DECREF(v);
3243 goto onError;
3244 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003245 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003246
Benjamin Peterson29060642009-01-31 22:14:21 +00003247 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003248 return NULL;
3249}
3250
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003251PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003252PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003253 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003254 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3255}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003256
Christian Heimes5894ba72007-11-04 11:43:14 +00003257PyObject*
3258PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3259{
Victor Stinner99b95382011-07-04 14:23:54 +02003260#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003261 return PyUnicode_DecodeMBCS(s, size, NULL);
3262#elif defined(__APPLE__)
3263 return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
3264#else
Victor Stinner793b5312011-04-27 00:24:21 +02003265 PyInterpreterState *interp = PyThreadState_GET()->interp;
3266 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3267 cannot use it to encode and decode filenames before it is loaded. Load
3268 the Python codec requires to encode at least its own filename. Use the C
3269 version of the locale codec until the codec registry is initialized and
3270 the Python codec is loaded.
3271
3272 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3273 cannot only rely on it: check also interp->fscodec_initialized for
3274 subinterpreters. */
3275 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003276 return PyUnicode_Decode(s, size,
3277 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003278 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003279 }
3280 else {
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003281 /* locale encoding with surrogateescape */
3282 wchar_t *wchar;
3283 PyObject *unicode;
Victor Stinner168e1172010-10-16 23:16:16 +00003284 size_t len;
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003285
3286 if (s[size] != '\0' || size != strlen(s)) {
3287 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3288 return NULL;
3289 }
3290
Victor Stinner168e1172010-10-16 23:16:16 +00003291 wchar = _Py_char2wchar(s, &len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003292 if (wchar == NULL)
Victor Stinnerd5af0a52010-11-08 23:34:29 +00003293 return PyErr_NoMemory();
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003294
Victor Stinner168e1172010-10-16 23:16:16 +00003295 unicode = PyUnicode_FromWideChar(wchar, len);
Victor Stinnerf3170cc2010-10-15 12:04:23 +00003296 PyMem_Free(wchar);
3297 return unicode;
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003298 }
Victor Stinnerad158722010-10-27 00:25:46 +00003299#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003300}
3301
Martin v. Löwis011e8422009-05-05 04:43:17 +00003302
3303int
3304PyUnicode_FSConverter(PyObject* arg, void* addr)
3305{
3306 PyObject *output = NULL;
3307 Py_ssize_t size;
3308 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003309 if (arg == NULL) {
3310 Py_DECREF(*(PyObject**)addr);
3311 return 1;
3312 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003313 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003314 output = arg;
3315 Py_INCREF(output);
3316 }
3317 else {
3318 arg = PyUnicode_FromObject(arg);
3319 if (!arg)
3320 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003321 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003322 Py_DECREF(arg);
3323 if (!output)
3324 return 0;
3325 if (!PyBytes_Check(output)) {
3326 Py_DECREF(output);
3327 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3328 return 0;
3329 }
3330 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003331 size = PyBytes_GET_SIZE(output);
3332 data = PyBytes_AS_STRING(output);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003333 if (size != strlen(data)) {
Benjamin Peterson7a6b44a2011-08-18 13:51:47 -05003334 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003335 Py_DECREF(output);
3336 return 0;
3337 }
3338 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003339 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003340}
3341
3342
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003343int
3344PyUnicode_FSDecoder(PyObject* arg, void* addr)
3345{
3346 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003347 if (arg == NULL) {
3348 Py_DECREF(*(PyObject**)addr);
3349 return 1;
3350 }
3351 if (PyUnicode_Check(arg)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003352 if (PyUnicode_READY(arg))
3353 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003354 output = arg;
3355 Py_INCREF(output);
3356 }
3357 else {
3358 arg = PyBytes_FromObject(arg);
3359 if (!arg)
3360 return 0;
3361 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3362 PyBytes_GET_SIZE(arg));
3363 Py_DECREF(arg);
3364 if (!output)
3365 return 0;
3366 if (!PyUnicode_Check(output)) {
3367 Py_DECREF(output);
3368 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3369 return 0;
3370 }
3371 }
Victor Stinner065836e2011-10-27 01:56:33 +02003372 if (PyUnicode_READY(output) < 0) {
3373 Py_DECREF(output);
3374 return 0;
3375 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003376 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003377 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003378 PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3379 Py_DECREF(output);
3380 return 0;
3381 }
3382 *(PyObject**)addr = output;
3383 return Py_CLEANUP_SUPPORTED;
3384}
3385
3386
Martin v. Löwis5b222132007-06-10 09:51:05 +00003387char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003388PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003389{
Christian Heimesf3863112007-11-22 07:46:41 +00003390 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003391
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003392 if (!PyUnicode_Check(unicode)) {
3393 PyErr_BadArgument();
3394 return NULL;
3395 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003396 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003397 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003398
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003399 if (PyUnicode_UTF8(unicode) == NULL) {
3400 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003401 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3402 if (bytes == NULL)
3403 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003404 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3405 if (_PyUnicode_UTF8(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003406 Py_DECREF(bytes);
3407 return NULL;
3408 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003409 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3410 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3411 PyBytes_AS_STRING(bytes),
3412 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003413 Py_DECREF(bytes);
3414 }
3415
3416 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003417 *psize = PyUnicode_UTF8_LENGTH(unicode);
3418 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003419}
3420
3421char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003422PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003423{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003424 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3425}
3426
3427#ifdef Py_DEBUG
Antoine Pitrou53bb5482011-10-10 23:49:24 +02003428static int unicode_as_unicode_calls = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003429#endif
3430
3431
3432Py_UNICODE *
3433PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3434{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003435 const unsigned char *one_byte;
3436#if SIZEOF_WCHAR_T == 4
3437 const Py_UCS2 *two_bytes;
3438#else
3439 const Py_UCS4 *four_bytes;
3440 const Py_UCS4 *ucs4_end;
3441 Py_ssize_t num_surrogates;
3442#endif
3443 wchar_t *w;
3444 wchar_t *wchar_end;
3445
3446 if (!PyUnicode_Check(unicode)) {
3447 PyErr_BadArgument();
3448 return NULL;
3449 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003450 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003451 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003452 assert(_PyUnicode_KIND(unicode) != 0);
3453 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003454
3455#ifdef Py_DEBUG
3456 ++unicode_as_unicode_calls;
3457#endif
3458
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003459 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003460#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003461 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3462 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003463 num_surrogates = 0;
3464
3465 for (; four_bytes < ucs4_end; ++four_bytes) {
3466 if (*four_bytes > 0xFFFF)
3467 ++num_surrogates;
3468 }
3469
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003470 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3471 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3472 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003473 PyErr_NoMemory();
3474 return NULL;
3475 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003476 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003477
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003478 w = _PyUnicode_WSTR(unicode);
3479 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3480 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003481 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3482 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003483 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003484 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003485 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3486 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003487 }
3488 else
3489 *w = *four_bytes;
3490
3491 if (w > wchar_end) {
3492 assert(0 && "Miscalculated string end");
3493 }
3494 }
3495 *w = 0;
3496#else
3497 /* sizeof(wchar_t) == 4 */
3498 Py_FatalError("Impossible unicode object state, wstr and str "
3499 "should share memory already.");
3500 return NULL;
3501#endif
3502 }
3503 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003504 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3505 (_PyUnicode_LENGTH(unicode) + 1));
3506 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003507 PyErr_NoMemory();
3508 return NULL;
3509 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003510 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3511 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3512 w = _PyUnicode_WSTR(unicode);
3513 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003514
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003515 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3516 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003517 for (; w < wchar_end; ++one_byte, ++w)
3518 *w = *one_byte;
3519 /* null-terminate the wstr */
3520 *w = 0;
3521 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003522 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003523#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003524 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003525 for (; w < wchar_end; ++two_bytes, ++w)
3526 *w = *two_bytes;
3527 /* null-terminate the wstr */
3528 *w = 0;
3529#else
3530 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003531 PyObject_FREE(_PyUnicode_WSTR(unicode));
3532 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003533 Py_FatalError("Impossible unicode object state, wstr "
3534 "and str should share memory already.");
3535 return NULL;
3536#endif
3537 }
3538 else {
3539 assert(0 && "This should never happen.");
3540 }
3541 }
3542 }
3543 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003544 *size = PyUnicode_WSTR_LENGTH(unicode);
3545 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003546}
3547
Alexander Belopolsky40018472011-02-26 01:02:56 +00003548Py_UNICODE *
3549PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003550{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003551 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003552}
3553
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003554
Alexander Belopolsky40018472011-02-26 01:02:56 +00003555Py_ssize_t
3556PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003557{
3558 if (!PyUnicode_Check(unicode)) {
3559 PyErr_BadArgument();
3560 goto onError;
3561 }
3562 return PyUnicode_GET_SIZE(unicode);
3563
Benjamin Peterson29060642009-01-31 22:14:21 +00003564 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003565 return -1;
3566}
3567
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003568Py_ssize_t
3569PyUnicode_GetLength(PyObject *unicode)
3570{
Victor Stinner5a706cf2011-10-02 00:36:53 +02003571 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003572 PyErr_BadArgument();
3573 return -1;
3574 }
3575
3576 return PyUnicode_GET_LENGTH(unicode);
3577}
3578
3579Py_UCS4
3580PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3581{
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003582 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3583 PyErr_BadArgument();
3584 return (Py_UCS4)-1;
3585 }
3586 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3587 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003588 return (Py_UCS4)-1;
3589 }
3590 return PyUnicode_READ_CHAR(unicode, index);
3591}
3592
3593int
3594PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3595{
3596 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003597 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003598 return -1;
3599 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02003600 if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3601 PyErr_SetString(PyExc_IndexError, "string index out of range");
3602 return -1;
3603 }
3604 if (_PyUnicode_Dirty(unicode))
3605 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003606 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3607 index, ch);
3608 return 0;
3609}
3610
/* Return the name of the default encoding, which is the fixed string
   "utf-8".  The returned pointer refers to a string literal. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    return "utf-8";
}
3616
Victor Stinner554f3f02010-06-16 23:33:54 +00003617/* create or adjust a UnicodeDecodeError */
3618static void
3619make_decode_exception(PyObject **exceptionObject,
3620 const char *encoding,
3621 const char *input, Py_ssize_t length,
3622 Py_ssize_t startpos, Py_ssize_t endpos,
3623 const char *reason)
3624{
3625 if (*exceptionObject == NULL) {
3626 *exceptionObject = PyUnicodeDecodeError_Create(
3627 encoding, input, length, startpos, endpos, reason);
3628 }
3629 else {
3630 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3631 goto onError;
3632 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3633 goto onError;
3634 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3635 goto onError;
3636 }
3637 return;
3638
3639onError:
3640 Py_DECREF(*exceptionObject);
3641 *exceptionObject = NULL;
3642}
3643
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003644/* error handling callback helper:
3645 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00003646 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003647 and adjust various state variables.
3648 return 0 on success, -1 on error
3649*/
3650
Alexander Belopolsky40018472011-02-26 01:02:56 +00003651static int
3652unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003653 const char *encoding, const char *reason,
3654 const char **input, const char **inend, Py_ssize_t *startinpos,
3655 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003656 PyObject **output, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003657{
Benjamin Peterson142957c2008-07-04 19:55:29 +00003658 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003659
3660 PyObject *restuple = NULL;
3661 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01003662 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003663 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003664 Py_ssize_t requiredsize;
3665 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003666 PyObject *inputobj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003667 int res = -1;
3668
Victor Stinner596a6c42011-11-09 00:02:18 +01003669 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND)
3670 outsize = PyUnicode_GET_LENGTH(*output);
3671 else
3672 outsize = _PyUnicode_WSTR_LENGTH(*output);
3673
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003674 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003675 *errorHandler = PyCodec_LookupError(errors);
3676 if (*errorHandler == NULL)
3677 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003678 }
3679
Victor Stinner554f3f02010-06-16 23:33:54 +00003680 make_decode_exception(exceptionObject,
3681 encoding,
3682 *input, *inend - *input,
3683 *startinpos, *endinpos,
3684 reason);
3685 if (*exceptionObject == NULL)
3686 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003687
3688 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3689 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003690 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003691 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00003692 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00003693 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003694 }
3695 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00003696 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003697 if (PyUnicode_READY(repunicode) < 0)
3698 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003699
3700 /* Copy back the bytes variables, which might have been modified by the
3701 callback */
3702 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3703 if (!inputobj)
3704 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00003705 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003706 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00003707 }
Christian Heimes72b710a2008-05-26 13:28:38 +00003708 *input = PyBytes_AS_STRING(inputobj);
3709 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003710 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00003711 /* we can DECREF safely, as the exception has another reference,
3712 so the object won't go away. */
3713 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00003714
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003715 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003716 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003717 if (newpos<0 || newpos>insize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003718 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3719 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003720 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003721
Victor Stinner596a6c42011-11-09 00:02:18 +01003722 if (_PyUnicode_KIND(*output) != PyUnicode_WCHAR_KIND) {
3723 /* need more space? (at least enough for what we
3724 have+the replacement+the rest of the string (starting
3725 at the new input position), so we won't have to check space
3726 when there are no errors in the rest of the string) */
3727 Py_ssize_t replen = PyUnicode_GET_LENGTH(repunicode);
3728 requiredsize = *outpos + replen + insize-newpos;
3729 if (requiredsize > outsize) {
3730 if (requiredsize<2*outsize)
3731 requiredsize = 2*outsize;
3732 if (unicode_resize(output, requiredsize) < 0)
3733 goto onError;
3734 }
3735 if (unicode_widen(output, PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00003736 goto onError;
Victor Stinner596a6c42011-11-09 00:02:18 +01003737 copy_characters(*output, *outpos, repunicode, 0, replen);
3738 *outpos += replen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003739 }
Victor Stinner596a6c42011-11-09 00:02:18 +01003740 else {
3741 wchar_t *repwstr;
3742 Py_ssize_t repwlen;
3743 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
3744 if (repwstr == NULL)
3745 goto onError;
3746 /* need more space? (at least enough for what we
3747 have+the replacement+the rest of the string (starting
3748 at the new input position), so we won't have to check space
3749 when there are no errors in the rest of the string) */
3750 requiredsize = *outpos + repwlen + insize-newpos;
3751 if (requiredsize > outsize) {
3752 if (requiredsize < 2*outsize)
3753 requiredsize = 2*outsize;
3754 if (unicode_resize(output, requiredsize) < 0)
3755 goto onError;
3756 }
3757 wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
3758 *outpos += repwlen;
3759 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003760 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003761 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00003762
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003763 /* we made it! */
3764 res = 0;
3765
Benjamin Peterson29060642009-01-31 22:14:21 +00003766 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003767 Py_XDECREF(restuple);
3768 return res;
3769}
3770
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64. */
3776
/* Is c a base-64 character?  True for A-Z, a-z, 0-9, '+' and '/'.
   Note that the argument is evaluated more than once. */

#define IS_BASE64(c)                    \
    ((c) == '+' || (c) == '/' ||        \
     ((c) >= '0' && (c) <= '9') ||      \
     ((c) >= 'A' && (c) <= 'Z') ||      \
     ((c) >= 'a' && (c) <= 'z'))
3784
/* Given that c is a base-64 character, what is its base-64 value?
   A-Z map to 0-25, a-z to 26-51, 0-9 to 52-61, '+' to 62; anything else
   (in particular '/') yields 63.  The argument is evaluated more than once. */

#define FROM_BASE64(c)                                  \
    (((c) == '+') ? 62 :                                \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     63)
3792
/* What is the base-64 character of the bottom 6 bits of n?
   Only the low six bits of n select the character. */

#define TO_BASE64(n)                    \
    (("ABCDEFGHIJKLMNOPQRSTUVWXYZ"      \
      "abcdefghijklmnopqrstuvwxyz"      \
      "0123456789+/")[(n) & 0x3f])
3797
/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) <= 127)
3805
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array, indexed by ASCII code,
 * identifies these different sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static char utf7_category[128] = {
    /* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
       3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
    /* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
       3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
    /* sp  !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /   */
       2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
    /* 0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?   */
       0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
    /* @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O   */
       1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _   */
       0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
    /* `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o   */
       1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~   del */
       0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
3839
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself (directO),
 * and also on whether we are encoding whitespace as itself (directWS).
 * RFC2152 makes it clear that the answers to these questions vary
 * between applications, so this code needs to be flexible.
 * Code point 0 and anything >= 128 are never encoded directly, so the
 * utf7_category lookup is only performed for indices 1..127. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) > 0 && (c) < 128 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003851
Alexander Belopolsky40018472011-02-26 01:02:56 +00003852PyObject *
3853PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003854 Py_ssize_t size,
3855 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003856{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003857 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3858}
3859
Antoine Pitrou244651a2009-05-04 18:56:13 +00003860/* The decoder. The only state we preserve is our read position,
3861 * i.e. how many characters we have consumed. So if we end in the
3862 * middle of a shift sequence we have to back off the read position
3863 * and the output to the beginning of the sequence, otherwise we lose
3864 * all the shift state (seen bits, number of bits seen, high
3865 * surrogate). */
3866
Alexander Belopolsky40018472011-02-26 01:02:56 +00003867PyObject *
3868PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003869 Py_ssize_t size,
3870 const char *errors,
3871 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003872{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003873 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00003874 Py_ssize_t startinpos;
3875 Py_ssize_t endinpos;
3876 Py_ssize_t outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003877 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01003878 PyObject *unicode;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003879 const char *errmsg = "";
3880 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003881 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003882 unsigned int base64bits = 0;
3883 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01003884 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003885 PyObject *errorHandler = NULL;
3886 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003887
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003888 /* Start off assuming it's all ASCII. Widen later as necessary. */
3889 unicode = PyUnicode_New(size, 127);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003890 if (!unicode)
3891 return NULL;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003892 if (size == 0) {
3893 if (consumed)
3894 *consumed = 0;
Victor Stinner7931d9a2011-11-04 00:22:48 +01003895 return unicode;
Christian Heimes5d14c2b2007-11-20 23:38:09 +00003896 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003897
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003898 shiftOutStart = outpos = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003899 e = s + size;
3900
3901 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003902 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00003903 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00003904 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003905
Antoine Pitrou244651a2009-05-04 18:56:13 +00003906 if (inShift) { /* in a base-64 section */
3907 if (IS_BASE64(ch)) { /* consume a base-64 character */
3908 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3909 base64bits += 6;
3910 s++;
3911 if (base64bits >= 16) {
3912 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01003913 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00003914 base64bits -= 16;
3915 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3916 if (surrogate) {
3917 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01003918 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
3919 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003920 if (unicode_putchar(&unicode, &outpos, ch2) < 0)
3921 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003922 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01003923 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003924 }
3925 else {
Antoine Pitrou78edf752011-11-15 01:44:16 +01003926 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
3927 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003928 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003929 }
3930 }
Victor Stinner551ac952011-11-29 22:58:13 +01003931 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00003932 /* first surrogate */
3933 surrogate = outCh;
3934 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003935 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003936 if (unicode_putchar(&unicode, &outpos, outCh) < 0)
3937 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003938 }
3939 }
3940 }
3941 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003942 inShift = 0;
3943 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003944 if (surrogate) {
Antoine Pitrou78edf752011-11-15 01:44:16 +01003945 if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
3946 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01003947 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003948 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003949 if (base64bits > 0) { /* left-over bits */
3950 if (base64bits >= 6) {
3951 /* We've seen at least one base-64 character */
3952 errmsg = "partial character in shift sequence";
3953 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003954 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003955 else {
3956 /* Some bits remain; they should be zero */
3957 if (base64buffer != 0) {
3958 errmsg = "non-zero padding bits in shift sequence";
3959 goto utf7Error;
3960 }
3961 }
3962 }
3963 if (ch != '-') {
3964 /* '-' is absorbed; other terminating
3965 characters are preserved */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003966 if (unicode_putchar(&unicode, &outpos, ch) < 0)
3967 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003968 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003969 }
3970 }
3971 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003972 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003973 s++; /* consume '+' */
3974 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003975 s++;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003976 if (unicode_putchar(&unicode, &outpos, '+') < 0)
3977 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003978 }
3979 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003980 inShift = 1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003981 shiftOutStart = outpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003982 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003983 }
3984 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003985 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01003986 if (unicode_putchar(&unicode, &outpos, ch) < 0)
3987 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003988 s++;
3989 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00003990 else {
3991 startinpos = s-starts;
3992 s++;
3993 errmsg = "unexpected special character";
3994 goto utf7Error;
3995 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00003996 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00003997utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003998 endinpos = s-starts;
3999 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00004000 errors, &errorHandler,
4001 "utf7", errmsg,
4002 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004003 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004004 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004005 }
4006
Antoine Pitrou244651a2009-05-04 18:56:13 +00004007 /* end of string */
4008
4009 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4010 /* if we're in an inconsistent state, that's an error */
4011 if (surrogate ||
4012 (base64bits >= 6) ||
4013 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004014 endinpos = size;
4015 if (unicode_decode_call_errorhandler(
4016 errors, &errorHandler,
4017 "utf7", "unterminated shift sequence",
4018 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004019 &unicode, &outpos))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004020 goto onError;
4021 if (s < e)
4022 goto restart;
4023 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004024 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004025
4026 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004027 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004028 if (inShift) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004029 outpos = shiftOutStart; /* back off output */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004030 *consumed = startinpos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004031 }
4032 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004033 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004034 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004035 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004036
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004037 if (unicode_resize(&unicode, outpos) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004038 goto onError;
4039
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004040 Py_XDECREF(errorHandler);
4041 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01004042 return unicode_result(unicode);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004043
Benjamin Peterson29060642009-01-31 22:14:21 +00004044 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004045 Py_XDECREF(errorHandler);
4046 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004047 Py_DECREF(unicode);
4048 return NULL;
4049}
4050
4051
Alexander Belopolsky40018472011-02-26 01:02:56 +00004052PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004053_PyUnicode_EncodeUTF7(PyObject *str,
4054 int base64SetO,
4055 int base64WhiteSpace,
4056 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004057{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004058 int kind;
4059 void *data;
4060 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004061 PyObject *v;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004062 Py_ssize_t allocated;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004063 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004064 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004065 unsigned int base64bits = 0;
4066 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004067 char * out;
4068 char * start;
4069
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004070 if (PyUnicode_READY(str) < 0)
4071 return NULL;
4072 kind = PyUnicode_KIND(str);
4073 data = PyUnicode_DATA(str);
4074 len = PyUnicode_GET_LENGTH(str);
4075
4076 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004077 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004078
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004079 /* It might be possible to tighten this worst case */
4080 allocated = 8 * len;
4081 if (allocated / 8 != len)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004082 return PyErr_NoMemory();
4083
Antoine Pitrou244651a2009-05-04 18:56:13 +00004084 v = PyBytes_FromStringAndSize(NULL, allocated);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004085 if (v == NULL)
4086 return NULL;
4087
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004088 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004089 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004090 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004091
Antoine Pitrou244651a2009-05-04 18:56:13 +00004092 if (inShift) {
4093 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4094 /* shifting out */
4095 if (base64bits) { /* output remaining bits */
4096 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4097 base64buffer = 0;
4098 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004099 }
4100 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004101 /* Characters not in the BASE64 set implicitly unshift the sequence
4102 so no '-' is required, except if the character is itself a '-' */
4103 if (IS_BASE64(ch) || ch == '-') {
4104 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004105 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004106 *out++ = (char) ch;
4107 }
4108 else {
4109 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004110 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004111 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004112 else { /* not in a shift sequence */
4113 if (ch == '+') {
4114 *out++ = '+';
4115 *out++ = '-';
4116 }
4117 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4118 *out++ = (char) ch;
4119 }
4120 else {
4121 *out++ = '+';
4122 inShift = 1;
4123 goto encode_char;
4124 }
4125 }
4126 continue;
4127encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004128 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004129 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004130
Antoine Pitrou244651a2009-05-04 18:56:13 +00004131 /* code first surrogate */
4132 base64bits += 16;
4133 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
4134 while (base64bits >= 6) {
4135 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4136 base64bits -= 6;
4137 }
4138 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004139 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004140 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004141 base64bits += 16;
4142 base64buffer = (base64buffer << 16) | ch;
4143 while (base64bits >= 6) {
4144 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4145 base64bits -= 6;
4146 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004147 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004148 if (base64bits)
4149 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4150 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004151 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004152 if (_PyBytes_Resize(&v, out - start) < 0)
4153 return NULL;
4154 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004155}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004156PyObject *
4157PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4158 Py_ssize_t size,
4159 int base64SetO,
4160 int base64WhiteSpace,
4161 const char *errors)
4162{
4163 PyObject *result;
4164 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4165 if (tmp == NULL)
4166 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004167 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004168 base64WhiteSpace, errors);
4169 Py_DECREF(tmp);
4170 return result;
4171}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004172
/* The UTF-7 helper macros are not needed past this point. */
#undef IS_BASE64
#undef FROM_BASE64
#undef TO_BASE64
#undef DECODE_DIRECT
#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004178
/* --- UTF-8 Codec -------------------------------------------------------- */
4180
/* Map a UTF-8 encoded prefix (lead) byte to the length of the sequence it
   starts.  Zero means illegal prefix.  See RFC 3629 for details. */
static char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
4202
Alexander Belopolsky40018472011-02-26 01:02:56 +00004203PyObject *
4204PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004205 Py_ssize_t size,
4206 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004207{
Walter Dörwald69652032004-09-07 20:24:22 +00004208 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4209}
4210
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004211#include "stringlib/ucs1lib.h"
4212#include "stringlib/codecs.h"
4213#include "stringlib/undef.h"
4214
4215#include "stringlib/ucs2lib.h"
4216#include "stringlib/codecs.h"
4217#include "stringlib/undef.h"
4218
4219#include "stringlib/ucs4lib.h"
4220#include "stringlib/codecs.h"
4221#include "stringlib/undef.h"
4222
/* Mask to check or force alignment of a pointer to C 'long' boundaries.
   The expansion is fully parenthesized so that it behaves as a single
   operand wherever it is used (e.g. after '~' or next to '&'). */
#define LONG_PTR_MASK ((size_t) (SIZEOF_LONG - 1))
4225
4226/* Mask to quickly check whether a C 'long' contains a
4227 non-ASCII, UTF8-encoded char. */
4228#if (SIZEOF_LONG == 8)
4229# define ASCII_CHAR_MASK 0x8080808080808080L
4230#elif (SIZEOF_LONG == 4)
4231# define ASCII_CHAR_MASK 0x80808080L
4232#else
4233# error C 'long' size should be either 4 or 8!
4234#endif
4235
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004236/* Scans a UTF-8 string and returns the maximum character to be expected
4237 and the size of the decoded unicode string.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004238
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004239 This function doesn't check for errors, these checks are performed in
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004240 PyUnicode_DecodeUTF8Stateful.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004241 */
4242static Py_UCS4
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004243utf8_max_char_size_and_char_count(const char *s, Py_ssize_t string_size,
4244 Py_ssize_t *unicode_size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004245{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004246 Py_ssize_t char_count = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004247 const unsigned char *p = (const unsigned char *)s;
4248 const unsigned char *end = p + string_size;
4249 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004250
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004251 assert(unicode_size != NULL);
4252
4253 /* By having a cascade of independent loops which fallback onto each
4254 other, we minimize the amount of work done in the average loop
4255 iteration, and we also maximize the CPU's ability to predict
4256 branches correctly (because a given condition will have always the
4257 same boolean outcome except perhaps in the last iteration of the
4258 corresponding loop).
4259 In the general case this brings us rather close to decoding
4260 performance pre-PEP 393, despite the two-pass decoding.
4261
4262 Note that the pure ASCII loop is not duplicated once a non-ASCII
4263 character has been encountered. It is actually a pessimization (by
4264 a significant factor) to use this loop on text with many non-ASCII
4265 characters, and it is important to avoid bad performance on valid
4266 utf-8 data (invalid utf-8 being a different can of worms).
4267 */
4268
4269 /* ASCII */
4270 for (; p < end; ++p) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004271 /* Only check value if it's not a ASCII char... */
4272 if (*p < 0x80) {
4273 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
4274 an explanation. */
4275 if (!((size_t) p & LONG_PTR_MASK)) {
4276 /* Help register allocation */
4277 register const unsigned char *_p = p;
4278 while (_p < aligned_end) {
4279 unsigned long value = *(unsigned long *) _p;
4280 if (value & ASCII_CHAR_MASK)
4281 break;
4282 _p += SIZEOF_LONG;
4283 char_count += SIZEOF_LONG;
4284 }
4285 p = _p;
4286 if (p == end)
4287 break;
4288 }
4289 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004290 if (*p < 0x80)
4291 ++char_count;
4292 else
4293 goto _ucs1loop;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004294 }
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004295 *unicode_size = char_count;
4296 return 127;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004297
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004298_ucs1loop:
4299 for (; p < end; ++p) {
4300 if (*p < 0xc4)
4301 char_count += ((*p & 0xc0) != 0x80);
4302 else
4303 goto _ucs2loop;
4304 }
4305 *unicode_size = char_count;
4306 return 255;
4307
4308_ucs2loop:
4309 for (; p < end; ++p) {
4310 if (*p < 0xf0)
4311 char_count += ((*p & 0xc0) != 0x80);
4312 else
4313 goto _ucs4loop;
4314 }
4315 *unicode_size = char_count;
4316 return 65535;
4317
4318_ucs4loop:
4319 for (; p < end; ++p) {
4320 char_count += ((*p & 0xc0) != 0x80);
4321 }
4322 *unicode_size = char_count;
4323 return 65537;
4324}
4325
/* Similar to PyUnicode_WRITE, but may first resize the string and lets
   unicode_putchar() store the character, so the write can fail.

   index  position to write to; evaluated exactly once
   value  character to store

   Implicit parameters taken from the expansion site: the variable
   `unicode` (passed by address, so it may be replaced) and the label
   `onError`, which is jumped to when resizing or storing fails.

   Resizing overallocates (pos + pos/8), so the result needs to be shrunk
   to its final length at the end.
*/
#define WRITE_MAYBE_FAIL(index, value)                                  \
    do {                                                                \
        Py_ssize_t wmf_pos_ = (index);                                  \
        if (wmf_pos_ > PyUnicode_GET_LENGTH(unicode) &&                 \
            unicode_resize(&unicode, wmf_pos_ + wmf_pos_/8) < 0)        \
            goto onError;                                               \
        if (unicode_putchar(&unicode, &wmf_pos_, (value)) < 0)          \
            goto onError;                                               \
    } while (0)
4339
Alexander Belopolsky40018472011-02-26 01:02:56 +00004340PyObject *
Victor Stinner785938e2011-12-11 20:09:03 +01004341decode_utf8_errors(const char *starts,
4342 Py_ssize_t size,
4343 const char *errors,
4344 Py_ssize_t *consumed,
4345 const char *s,
4346 PyObject *unicode,
4347 Py_ssize_t i)
Walter Dörwald69652032004-09-07 20:24:22 +00004348{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004349 int n;
Ezio Melotti57221d02010-07-01 07:32:02 +00004350 int k;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004351 Py_ssize_t startinpos;
4352 Py_ssize_t endinpos;
Victor Stinner785938e2011-12-11 20:09:03 +01004353 const char *e = starts + size;
4354 const char *aligned_end;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004355 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004356 PyObject *errorHandler = NULL;
4357 PyObject *exc = NULL;
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004358
Antoine Pitrouab868312009-01-10 15:40:25 +00004359 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004360
4361 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004362 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004363
4364 if (ch < 0x80) {
Antoine Pitrouab868312009-01-10 15:40:25 +00004365 /* Fast path for runs of ASCII characters. Given that common UTF-8
4366 input will consist of an overwhelming majority of ASCII
4367 characters, we try to optimize for this case by checking
4368 as many characters as a C 'long' can contain.
4369 First, check if we can do an aligned read, as most CPUs have
4370 a penalty for unaligned reads.
4371 */
4372 if (!((size_t) s & LONG_PTR_MASK)) {
4373 /* Help register allocation */
4374 register const char *_s = s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004375 register Py_ssize_t _i = i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004376 while (_s < aligned_end) {
4377 /* Read a whole long at a time (either 4 or 8 bytes),
4378 and do a fast unrolled copy if it only contains ASCII
4379 characters. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004380 unsigned long value = *(unsigned long *) _s;
4381 if (value & ASCII_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00004382 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004383 WRITE_MAYBE_FAIL(_i+0, _s[0]);
4384 WRITE_MAYBE_FAIL(_i+1, _s[1]);
4385 WRITE_MAYBE_FAIL(_i+2, _s[2]);
4386 WRITE_MAYBE_FAIL(_i+3, _s[3]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004387#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004388 WRITE_MAYBE_FAIL(_i+4, _s[4]);
4389 WRITE_MAYBE_FAIL(_i+5, _s[5]);
4390 WRITE_MAYBE_FAIL(_i+6, _s[6]);
4391 WRITE_MAYBE_FAIL(_i+7, _s[7]);
Antoine Pitrouab868312009-01-10 15:40:25 +00004392#endif
4393 _s += SIZEOF_LONG;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004394 _i += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00004395 }
4396 s = _s;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004397 i = _i;
Antoine Pitrouab868312009-01-10 15:40:25 +00004398 if (s == e)
4399 break;
4400 ch = (unsigned char)*s;
4401 }
4402 }
4403
4404 if (ch < 0x80) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004405 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004406 s++;
4407 continue;
4408 }
4409
4410 n = utf8_code_length[ch];
4411
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004412 if (s + n > e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004413 if (consumed)
4414 break;
4415 else {
4416 errmsg = "unexpected end of data";
4417 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004418 endinpos = startinpos+1;
4419 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4420 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004421 goto utf8Error;
4422 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00004423 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004424
4425 switch (n) {
4426
4427 case 0:
Ezio Melotti57221d02010-07-01 07:32:02 +00004428 errmsg = "invalid start byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004429 startinpos = s-starts;
4430 endinpos = startinpos+1;
4431 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004432
4433 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004434 errmsg = "internal error";
Benjamin Peterson29060642009-01-31 22:14:21 +00004435 startinpos = s-starts;
4436 endinpos = startinpos+1;
4437 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004438
4439 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00004440 if ((s[1] & 0xc0) != 0x80) {
Ezio Melotti57221d02010-07-01 07:32:02 +00004441 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004442 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004443 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00004444 goto utf8Error;
4445 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004446 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004447 assert ((ch > 0x007F) && (ch <= 0x07FF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004448 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004449 break;
4450
4451 case 3:
Ezio Melotti9bf2b3a2010-07-03 04:52:19 +00004452 /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4453 will result in surrogates in range d800-dfff. Surrogates are
4454 not valid UTF-8 so they are rejected.
4455 See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4456 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
Tim Petersced69f82003-09-16 20:30:58 +00004457 if ((s[1] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004458 (s[2] & 0xc0) != 0x80 ||
4459 ((unsigned char)s[0] == 0xE0 &&
4460 (unsigned char)s[1] < 0xA0) ||
4461 ((unsigned char)s[0] == 0xED &&
4462 (unsigned char)s[1] > 0x9F)) {
4463 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004464 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004465 endinpos = startinpos + 1;
4466
4467 /* if s[1] first two bits are 1 and 0, then the invalid
4468 continuation byte is s[2], so increment endinpos by 1,
4469 if not, s[1] is invalid and endinpos doesn't need to
4470 be incremented. */
4471 if ((s[1] & 0xC0) == 0x80)
4472 endinpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00004473 goto utf8Error;
4474 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004475 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Ezio Melotti57221d02010-07-01 07:32:02 +00004476 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004477 WRITE_MAYBE_FAIL(i++, ch);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004478 break;
4479
4480 case 4:
4481 if ((s[1] & 0xc0) != 0x80 ||
4482 (s[2] & 0xc0) != 0x80 ||
Ezio Melotti57221d02010-07-01 07:32:02 +00004483 (s[3] & 0xc0) != 0x80 ||
4484 ((unsigned char)s[0] == 0xF0 &&
4485 (unsigned char)s[1] < 0x90) ||
4486 ((unsigned char)s[0] == 0xF4 &&
4487 (unsigned char)s[1] > 0x8F)) {
4488 errmsg = "invalid continuation byte";
Benjamin Peterson29060642009-01-31 22:14:21 +00004489 startinpos = s-starts;
Ezio Melotti57221d02010-07-01 07:32:02 +00004490 endinpos = startinpos + 1;
4491 if ((s[1] & 0xC0) == 0x80) {
4492 endinpos++;
4493 if ((s[2] & 0xC0) == 0x80)
4494 endinpos++;
4495 }
Benjamin Peterson29060642009-01-31 22:14:21 +00004496 goto utf8Error;
4497 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004498 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
Ezio Melotti57221d02010-07-01 07:32:02 +00004499 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
Victor Stinner8faf8212011-12-08 22:14:11 +01004500 assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
Ezio Melotti57221d02010-07-01 07:32:02 +00004501
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004502 WRITE_MAYBE_FAIL(i++, ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004503 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004504 }
4505 s += n;
Benjamin Peterson29060642009-01-31 22:14:21 +00004506 continue;
Tim Petersced69f82003-09-16 20:30:58 +00004507
Benjamin Peterson29060642009-01-31 22:14:21 +00004508 utf8Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00004509 if (unicode_decode_call_errorhandler(
4510 errors, &errorHandler,
4511 "utf8", errmsg,
4512 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004513 &unicode, &i))
Benjamin Peterson29060642009-01-31 22:14:21 +00004514 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004515 /* Update data because unicode_decode_call_errorhandler might have
4516 re-created or resized the unicode object. */
Benjamin Peterson29060642009-01-31 22:14:21 +00004517 aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004518 }
Walter Dörwald69652032004-09-07 20:24:22 +00004519 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00004520 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004521
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004522 /* Adjust length and ready string when it contained errors and
4523 is of the old resizable kind. */
Victor Stinner785938e2011-12-11 20:09:03 +01004524 if (unicode_resize(&unicode, i) < 0)
4525 goto onError;
4526 unicode_adjust_maxchar(&unicode);
4527 if (unicode == NULL)
4528 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004529
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004530 Py_XDECREF(errorHandler);
4531 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02004532 assert(_PyUnicode_CheckConsistency(unicode, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01004533 return unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004534
Benjamin Peterson29060642009-01-31 22:14:21 +00004535 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004536 Py_XDECREF(errorHandler);
4537 Py_XDECREF(exc);
Victor Stinner785938e2011-12-11 20:09:03 +01004538 Py_XDECREF(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004539 return NULL;
4540}
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004541#undef WRITE_MAYBE_FAIL
Antoine Pitrouab868312009-01-10 15:40:25 +00004542
Victor Stinner785938e2011-12-11 20:09:03 +01004543PyObject *
4544PyUnicode_DecodeUTF8Stateful(const char *s,
4545 Py_ssize_t size,
4546 const char *errors,
4547 Py_ssize_t *consumed)
4548{
4549 Py_UCS4 maxchar = 0;
4550 Py_ssize_t unicode_size;
4551 int has_errors = 0;
4552 PyObject *unicode;
4553 int kind;
4554 void *data;
4555 const char *starts = s;
4556 const char *e;
4557 Py_ssize_t i;
4558
4559 if (size == 0) {
4560 if (consumed)
4561 *consumed = 0;
Victor Stinner382955f2011-12-11 21:44:00 +01004562 Py_INCREF(unicode_empty);
4563 return unicode_empty;
Victor Stinner785938e2011-12-11 20:09:03 +01004564 }
4565
4566 maxchar = utf8_max_char_size_and_char_count(s, size, &unicode_size);
4567
4568 /* When the string is ASCII only, just use memcpy and return.
4569 unicode_size may be != size if there is an incomplete UTF-8
4570 sequence at the end of the ASCII block. */
4571 if (maxchar < 128 && size == unicode_size) {
4572 if (consumed)
4573 *consumed = size;
4574 return unicode_fromascii(s, size);
4575 }
4576
4577 unicode = PyUnicode_New(unicode_size, maxchar);
4578 if (!unicode)
4579 return NULL;
4580 kind = PyUnicode_KIND(unicode);
4581 data = PyUnicode_DATA(unicode);
4582
4583 /* Unpack UTF-8 encoded data */
4584 i = 0;
4585 e = starts + size;
4586 switch (kind) {
4587 case PyUnicode_1BYTE_KIND:
4588 has_errors = ucs1lib_utf8_try_decode(s, e, (Py_UCS1 *) data, &s, &i);
4589 break;
4590 case PyUnicode_2BYTE_KIND:
4591 has_errors = ucs2lib_utf8_try_decode(s, e, (Py_UCS2 *) data, &s, &i);
4592 break;
4593 case PyUnicode_4BYTE_KIND:
4594 has_errors = ucs4lib_utf8_try_decode(s, e, (Py_UCS4 *) data, &s, &i);
4595 break;
4596 }
4597 if (!has_errors) {
4598 /* Ensure the unicode size calculation was correct */
4599 assert(i == unicode_size);
4600 assert(s == e);
4601 if (consumed)
4602 *consumed = size;
4603 return unicode;
4604 }
4605
4606 /* In case of errors, maxchar and size computation might be incorrect;
4607 code below refits and resizes as necessary. */
4608 return decode_utf8_errors(starts, size, errors, consumed, s, unicode, i);
4609}
4610
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Decodes `size` bytes at `s` into a newly allocated, NUL-terminated
   wchar_t array obtained from PyMem_Malloc().  Every byte that is not
   part of a valid sequence is stored as 0xDC00 + byte.  Returns NULL if
   the buffer cannot be allocated. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *limit;
    wchar_t *result, *out;
    int seqlen;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    result = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (result == NULL)
        return NULL;

    /* Unpack UTF-8 encoded data */
    out = result;
    limit = s + size;
    while (s < limit) {
        const unsigned char *b = (const unsigned char *)s;
        Py_UCS4 ch = b[0];

        /* ASCII is copied through unchanged */
        if (ch < 0x80) {
            *out++ = (wchar_t)ch;
            s++;
            continue;
        }

        /* Sequence length announced by the first byte; a sequence that
           would run past the end of the input is escaped. */
        seqlen = utf8_code_length[ch];
        if (s + seqlen > limit) {
            goto escape_byte;
        }

        /* ch still holds the first byte whenever escape_byte is reached:
           it is only overwritten once a sequence has been validated. */
        switch (seqlen) {
        case 0:
        case 1:
            goto escape_byte;

        case 2:
            if ((b[1] & 0xc0) != 0x80)
                goto escape_byte;
            ch = ((b[0] & 0x1f) << 6) + (b[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            *out++ = (wchar_t)ch;
            break;

        case 3:
            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
               will result in surrogates in range d800-dfff. Surrogates are
               not valid UTF-8 so they are rejected.
               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            if (!((b[1] & 0xc0) == 0x80 &&
                  (b[2] & 0xc0) == 0x80 &&
                  !(b[0] == 0xE0 && b[1] < 0xA0) &&
                  !(b[0] == 0xED && b[1] > 0x9F))) {
                goto escape_byte;
            }
            ch = ((b[0] & 0x0f) << 12) + ((b[1] & 0x3f) << 6) + (b[2] & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            *out++ = (wchar_t)ch;
            break;

        case 4:
            if (!((b[1] & 0xc0) == 0x80 &&
                  (b[2] & 0xc0) == 0x80 &&
                  (b[3] & 0xc0) == 0x80 &&
                  !(b[0] == 0xF0 && b[1] < 0x90) &&
                  !(b[0] == 0xF4 && b[1] > 0x8F))) {
                goto escape_byte;
            }
            ch = ((b[0] & 0x7) << 18) + ((b[1] & 0x3f) << 12) +
                 ((b[2] & 0x3f) << 6) + (b[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));

#if SIZEOF_WCHAR_T == 4
            *out++ = (wchar_t)ch;
#else
            /*  compute and append the two surrogates: */
            *out++ = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            *out++ = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
            break;
        }
        s += seqlen;
        continue;

      escape_byte:
        /* Undecodable byte: store it as a lone surrogate, skip one byte */
        *out++ = 0xDC00 + ch;
        s++;
    }
    *out = L'\0';
    return result;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004718
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004719/* Primary internal function which creates utf8 encoded bytes objects.
4720
4721 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004722 and allocate exactly as much space needed at the end. Else allocate the
4723 maximum possible needed (4 result bytes per Unicode character), and return
4724 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004725*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004726PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004727_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004728{
Tim Peters602f7402002-04-27 18:03:26 +00004729#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00004730
Guido van Rossum98297ee2007-11-06 21:34:58 +00004731 Py_ssize_t i; /* index into s of next input byte */
4732 PyObject *result; /* result string object */
4733 char *p; /* next free byte in output buffer */
4734 Py_ssize_t nallocated; /* number of result bytes allocated */
4735 Py_ssize_t nneeded; /* number of result bytes needed */
Tim Peters602f7402002-04-27 18:03:26 +00004736 char stackbuf[MAX_SHORT_UNICHARS * 4];
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004737 PyObject *errorHandler = NULL;
4738 PyObject *exc = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004739 int kind;
4740 void *data;
4741 Py_ssize_t size;
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004742 PyObject *rep = NULL;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004743
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004744 if (!PyUnicode_Check(unicode)) {
4745 PyErr_BadArgument();
4746 return NULL;
4747 }
4748
4749 if (PyUnicode_READY(unicode) == -1)
4750 return NULL;
4751
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004752 if (PyUnicode_UTF8(unicode))
4753 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4754 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004755
4756 kind = PyUnicode_KIND(unicode);
4757 data = PyUnicode_DATA(unicode);
4758 size = PyUnicode_GET_LENGTH(unicode);
4759
Tim Peters602f7402002-04-27 18:03:26 +00004760 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004761
Tim Peters602f7402002-04-27 18:03:26 +00004762 if (size <= MAX_SHORT_UNICHARS) {
4763 /* Write into the stack buffer; nallocated can't overflow.
4764 * At the end, we'll allocate exactly as much heap space as it
4765 * turns out we need.
4766 */
4767 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004768 result = NULL; /* will allocate after we're done */
Tim Peters602f7402002-04-27 18:03:26 +00004769 p = stackbuf;
4770 }
4771 else {
4772 /* Overallocate on the heap, and give the excess back at the end. */
4773 nallocated = size * 4;
4774 if (nallocated / 4 != size) /* overflow! */
4775 return PyErr_NoMemory();
Christian Heimes72b710a2008-05-26 13:28:38 +00004776 result = PyBytes_FromStringAndSize(NULL, nallocated);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004777 if (result == NULL)
Tim Peters602f7402002-04-27 18:03:26 +00004778 return NULL;
Christian Heimes72b710a2008-05-26 13:28:38 +00004779 p = PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004780 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004781
Tim Peters602f7402002-04-27 18:03:26 +00004782 for (i = 0; i < size;) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004783 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004784
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004785 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00004786 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004787 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00004788
Guido van Rossumd57fd912000-03-10 22:53:23 +00004789 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00004790 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00004791 *p++ = (char)(0xc0 | (ch >> 6));
4792 *p++ = (char)(0x80 | (ch & 0x3f));
Victor Stinner551ac952011-11-29 22:58:13 +01004793 } else if (Py_UNICODE_IS_SURROGATE(ch)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004794 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004795 Py_ssize_t repsize, k, startpos;
4796 startpos = i-1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004797 rep = unicode_encode_call_errorhandler(
4798 errors, &errorHandler, "utf-8", "surrogates not allowed",
Victor Stinner7931d9a2011-11-04 00:22:48 +01004799 unicode, &exc, startpos, startpos+1, &newpos);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004800 if (!rep)
4801 goto error;
Victor Stinner31be90b2010-04-22 19:38:16 +00004802
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004803 if (PyBytes_Check(rep))
4804 repsize = PyBytes_GET_SIZE(rep);
4805 else
Victor Stinner9e30aa52011-11-21 02:49:52 +01004806 repsize = PyUnicode_GET_LENGTH(rep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004807
4808 if (repsize > 4) {
4809 Py_ssize_t offset;
4810
4811 if (result == NULL)
4812 offset = p - stackbuf;
Victor Stinner31be90b2010-04-22 19:38:16 +00004813 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004814 offset = p - PyBytes_AS_STRING(result);
Victor Stinner31be90b2010-04-22 19:38:16 +00004815
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004816 if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4817 /* integer overflow */
4818 PyErr_NoMemory();
4819 goto error;
4820 }
4821 nallocated += repsize - 4;
4822 if (result != NULL) {
4823 if (_PyBytes_Resize(&result, nallocated) < 0)
4824 goto error;
4825 } else {
4826 result = PyBytes_FromStringAndSize(NULL, nallocated);
Victor Stinner31be90b2010-04-22 19:38:16 +00004827 if (result == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004828 goto error;
4829 Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4830 }
4831 p = PyBytes_AS_STRING(result) + offset;
4832 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004833
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004834 if (PyBytes_Check(rep)) {
4835 char *prep = PyBytes_AS_STRING(rep);
4836 for(k = repsize; k > 0; k--)
4837 *p++ = *prep++;
4838 } else /* rep is unicode */ {
Victor Stinnera98b28c2011-11-10 20:21:49 +01004839 enum PyUnicode_Kind repkind;
4840 void *repdata;
4841
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004842 if (PyUnicode_READY(rep) < 0)
Victor Stinnera98b28c2011-11-10 20:21:49 +01004843 goto error;
Victor Stinnera98b28c2011-11-10 20:21:49 +01004844 repkind = PyUnicode_KIND(rep);
4845 repdata = PyUnicode_DATA(rep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004846
4847 for(k=0; k<repsize; k++) {
Victor Stinnera98b28c2011-11-10 20:21:49 +01004848 Py_UCS4 c = PyUnicode_READ(repkind, repdata, k);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004849 if (0x80 <= c) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01004850 raise_encode_exception(&exc, "utf-8",
Victor Stinner7931d9a2011-11-04 00:22:48 +01004851 unicode,
Martin v. Löwis9e816682011-11-02 12:45:42 +01004852 i-1, i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004853 "surrogates not allowed");
Victor Stinner31be90b2010-04-22 19:38:16 +00004854 goto error;
4855 }
Victor Stinnera98b28c2011-11-10 20:21:49 +01004856 *p++ = (char)c;
Victor Stinner31be90b2010-04-22 19:38:16 +00004857 }
Victor Stinner31be90b2010-04-22 19:38:16 +00004858 }
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004859 Py_CLEAR(rep);
Victor Stinner31be90b2010-04-22 19:38:16 +00004860 } else if (ch < 0x10000) {
4861 *p++ = (char)(0xe0 | (ch >> 12));
4862 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4863 *p++ = (char)(0x80 | (ch & 0x3f));
4864 } else /* ch >= 0x10000 */ {
Victor Stinner8faf8212011-12-08 22:14:11 +01004865 assert(ch <= MAX_UNICODE);
Tim Peters602f7402002-04-27 18:03:26 +00004866 /* Encode UCS4 Unicode ordinals */
4867 *p++ = (char)(0xf0 | (ch >> 18));
4868 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4869 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4870 *p++ = (char)(0x80 | (ch & 0x3f));
4871 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004872 }
Tim Peters0eca65c2002-04-21 17:28:06 +00004873
Guido van Rossum98297ee2007-11-06 21:34:58 +00004874 if (result == NULL) {
Tim Peters602f7402002-04-27 18:03:26 +00004875 /* This was stack allocated. */
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00004876 nneeded = p - stackbuf;
Tim Peters602f7402002-04-27 18:03:26 +00004877 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004878 result = PyBytes_FromStringAndSize(stackbuf, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004879 }
4880 else {
Christian Heimesf3863112007-11-22 07:46:41 +00004881 /* Cut back to size actually needed. */
Christian Heimes72b710a2008-05-26 13:28:38 +00004882 nneeded = p - PyBytes_AS_STRING(result);
Tim Peters602f7402002-04-27 18:03:26 +00004883 assert(nneeded <= nallocated);
Christian Heimes72b710a2008-05-26 13:28:38 +00004884 _PyBytes_Resize(&result, nneeded);
Tim Peters602f7402002-04-27 18:03:26 +00004885 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004886
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004887 Py_XDECREF(errorHandler);
4888 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00004889 return result;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004890 error:
Antoine Pitrou31b92a52011-11-12 18:35:19 +01004891 Py_XDECREF(rep);
Martin v. Löwisdb12d452009-05-02 18:52:14 +00004892 Py_XDECREF(errorHandler);
4893 Py_XDECREF(exc);
4894 Py_XDECREF(result);
4895 return NULL;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004896
Tim Peters602f7402002-04-27 18:03:26 +00004897#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00004898}
4899
Alexander Belopolsky40018472011-02-26 01:02:56 +00004900PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004901PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4902 Py_ssize_t size,
4903 const char *errors)
4904{
4905 PyObject *v, *unicode;
4906
4907 unicode = PyUnicode_FromUnicode(s, size);
4908 if (unicode == NULL)
4909 return NULL;
4910 v = _PyUnicode_AsUTF8String(unicode, errors);
4911 Py_DECREF(unicode);
4912 return v;
4913}
4914
4915PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004916PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004917{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004918 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004919}
4920
Walter Dörwald41980ca2007-08-16 21:55:45 +00004921/* --- UTF-32 Codec ------------------------------------------------------- */
4922
4923PyObject *
4924PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004925 Py_ssize_t size,
4926 const char *errors,
4927 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004928{
4929 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4930}
4931
4932PyObject *
4933PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004934 Py_ssize_t size,
4935 const char *errors,
4936 int *byteorder,
4937 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004938{
4939 const char *starts = s;
4940 Py_ssize_t startinpos;
4941 Py_ssize_t endinpos;
4942 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01004943 PyObject *unicode;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004944 const unsigned char *q, *e;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004945 int bo = 0; /* assume native ordering by default */
4946 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004947 /* Offsets from q for retrieving bytes in the right order. */
4948#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4949 int iorder[] = {0, 1, 2, 3};
4950#else
4951 int iorder[] = {3, 2, 1, 0};
4952#endif
4953 PyObject *errorHandler = NULL;
4954 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004955
Walter Dörwald41980ca2007-08-16 21:55:45 +00004956 q = (unsigned char *)s;
4957 e = q + size;
4958
4959 if (byteorder)
4960 bo = *byteorder;
4961
4962 /* Check for BOM marks (U+FEFF) in the input and adjust current
4963 byte order setting accordingly. In native mode, the leading BOM
4964 mark is skipped, in all other modes, it is copied to the output
4965 stream as-is (giving a ZWNBSP character). */
4966 if (bo == 0) {
4967 if (size >= 4) {
4968 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
Benjamin Peterson29060642009-01-31 22:14:21 +00004969 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00004970#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00004971 if (bom == 0x0000FEFF) {
4972 q += 4;
4973 bo = -1;
4974 }
4975 else if (bom == 0xFFFE0000) {
4976 q += 4;
4977 bo = 1;
4978 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004979#else
Benjamin Peterson29060642009-01-31 22:14:21 +00004980 if (bom == 0x0000FEFF) {
4981 q += 4;
4982 bo = 1;
4983 }
4984 else if (bom == 0xFFFE0000) {
4985 q += 4;
4986 bo = -1;
4987 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004988#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00004989 }
Walter Dörwald41980ca2007-08-16 21:55:45 +00004990 }
4991
4992 if (bo == -1) {
4993 /* force LE */
4994 iorder[0] = 0;
4995 iorder[1] = 1;
4996 iorder[2] = 2;
4997 iorder[3] = 3;
4998 }
4999 else if (bo == 1) {
5000 /* force BE */
5001 iorder[0] = 3;
5002 iorder[1] = 2;
5003 iorder[2] = 1;
5004 iorder[3] = 0;
5005 }
5006
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005007 /* This might be one to much, because of a BOM */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005008 unicode = PyUnicode_New((size+3)/4, 127);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005009 if (!unicode)
5010 return NULL;
5011 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005012 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005013 outpos = 0;
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005014
Walter Dörwald41980ca2007-08-16 21:55:45 +00005015 while (q < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005016 Py_UCS4 ch;
5017 /* remaining bytes at the end? (size should be divisible by 4) */
5018 if (e-q<4) {
5019 if (consumed)
5020 break;
5021 errmsg = "truncated data";
5022 startinpos = ((const char *)q)-starts;
5023 endinpos = ((const char *)e)-starts;
5024 goto utf32Error;
5025 /* The remaining input chars are ignored if the callback
5026 chooses to skip the input */
5027 }
5028 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
5029 (q[iorder[1]] << 8) | q[iorder[0]];
Walter Dörwald41980ca2007-08-16 21:55:45 +00005030
Benjamin Peterson29060642009-01-31 22:14:21 +00005031 if (ch >= 0x110000)
5032 {
5033 errmsg = "codepoint not in range(0x110000)";
5034 startinpos = ((const char *)q)-starts;
5035 endinpos = startinpos+4;
5036 goto utf32Error;
5037 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005038 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5039 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005040 q += 4;
5041 continue;
5042 utf32Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005043 if (unicode_decode_call_errorhandler(
5044 errors, &errorHandler,
5045 "utf32", errmsg,
5046 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005047 &unicode, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005048 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005049 }
5050
5051 if (byteorder)
5052 *byteorder = bo;
5053
5054 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005055 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005056
5057 /* Adjust length */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005058 if (PyUnicode_Resize(&unicode, outpos) < 0)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005059 goto onError;
5060
5061 Py_XDECREF(errorHandler);
5062 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005063 return unicode_result(unicode);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005064
Benjamin Peterson29060642009-01-31 22:14:21 +00005065 onError:
Walter Dörwald41980ca2007-08-16 21:55:45 +00005066 Py_DECREF(unicode);
5067 Py_XDECREF(errorHandler);
5068 Py_XDECREF(exc);
5069 return NULL;
5070}
5071
5072PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005073_PyUnicode_EncodeUTF32(PyObject *str,
5074 const char *errors,
5075 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005076{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005077 int kind;
5078 void *data;
5079 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005080 PyObject *v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005081 unsigned char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005082 Py_ssize_t nsize, bytesize, i;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005083 /* Offsets from p for storing byte pairs in the right order. */
5084#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5085 int iorder[] = {0, 1, 2, 3};
5086#else
5087 int iorder[] = {3, 2, 1, 0};
5088#endif
5089
Benjamin Peterson29060642009-01-31 22:14:21 +00005090#define STORECHAR(CH) \
5091 do { \
5092 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5093 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5094 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5095 p[iorder[0]] = (CH) & 0xff; \
5096 p += 4; \
Walter Dörwald41980ca2007-08-16 21:55:45 +00005097 } while(0)
5098
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005099 if (!PyUnicode_Check(str)) {
5100 PyErr_BadArgument();
5101 return NULL;
5102 }
5103 if (PyUnicode_READY(str) < 0)
5104 return NULL;
5105 kind = PyUnicode_KIND(str);
5106 data = PyUnicode_DATA(str);
5107 len = PyUnicode_GET_LENGTH(str);
5108
5109 nsize = len + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005110 bytesize = nsize * 4;
5111 if (bytesize / 4 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005112 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005113 v = PyBytes_FromStringAndSize(NULL, bytesize);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005114 if (v == NULL)
5115 return NULL;
5116
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005117 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005118 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005119 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005120 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005121 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005122
5123 if (byteorder == -1) {
5124 /* force LE */
5125 iorder[0] = 0;
5126 iorder[1] = 1;
5127 iorder[2] = 2;
5128 iorder[3] = 3;
5129 }
5130 else if (byteorder == 1) {
5131 /* force BE */
5132 iorder[0] = 3;
5133 iorder[1] = 2;
5134 iorder[2] = 1;
5135 iorder[3] = 0;
5136 }
5137
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005138 for (i = 0; i < len; i++)
5139 STORECHAR(PyUnicode_READ(kind, data, i));
Guido van Rossum98297ee2007-11-06 21:34:58 +00005140
5141 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005142 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005143#undef STORECHAR
5144}
5145
Alexander Belopolsky40018472011-02-26 01:02:56 +00005146PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005147PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5148 Py_ssize_t size,
5149 const char *errors,
5150 int byteorder)
5151{
5152 PyObject *result;
5153 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5154 if (tmp == NULL)
5155 return NULL;
5156 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5157 Py_DECREF(tmp);
5158 return result;
5159}
5160
5161PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005162PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005163{
Victor Stinnerb960b342011-11-20 19:12:52 +01005164 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005165}
5166
Guido van Rossumd57fd912000-03-10 22:53:23 +00005167/* --- UTF-16 Codec ------------------------------------------------------- */
5168
Tim Peters772747b2001-08-09 22:21:55 +00005169PyObject *
5170PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005171 Py_ssize_t size,
5172 const char *errors,
5173 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005174{
Walter Dörwald69652032004-09-07 20:24:22 +00005175 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5176}
5177
Antoine Pitrouab868312009-01-10 15:40:25 +00005178/* Two masks for fast checking of whether a C 'long' may contain
5179 UTF16-encoded surrogate characters. This is an efficient heuristic,
5180 assuming that non-surrogate characters with a code point >= 0x8000 are
5181 rare in most input.
5182 FAST_CHAR_MASK is used when the input is in native byte ordering,
5183 SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
Benjamin Peterson29060642009-01-31 22:14:21 +00005184*/
Antoine Pitrouab868312009-01-10 15:40:25 +00005185#if (SIZEOF_LONG == 8)
5186# define FAST_CHAR_MASK 0x8000800080008000L
5187# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
5188#elif (SIZEOF_LONG == 4)
5189# define FAST_CHAR_MASK 0x80008000L
5190# define SWAPPED_FAST_CHAR_MASK 0x00800080L
5191#else
5192# error C 'long' size should be either 4 or 8!
5193#endif
5194
Walter Dörwald69652032004-09-07 20:24:22 +00005195PyObject *
5196PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005197 Py_ssize_t size,
5198 const char *errors,
5199 int *byteorder,
5200 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005201{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005202 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005203 Py_ssize_t startinpos;
5204 Py_ssize_t endinpos;
5205 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005206 PyObject *unicode;
Antoine Pitrouab868312009-01-10 15:40:25 +00005207 const unsigned char *q, *e, *aligned_end;
Tim Peters772747b2001-08-09 22:21:55 +00005208 int bo = 0; /* assume native ordering by default */
Antoine Pitrouab868312009-01-10 15:40:25 +00005209 int native_ordering = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005210 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00005211 /* Offsets from q for retrieving byte pairs in the right order. */
5212#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5213 int ihi = 1, ilo = 0;
5214#else
5215 int ihi = 0, ilo = 1;
5216#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005217 PyObject *errorHandler = NULL;
5218 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005219
5220 /* Note: size will always be longer than the resulting Unicode
5221 character count */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005222 unicode = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005223 if (!unicode)
5224 return NULL;
5225 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005226 return unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005227 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005228
Tim Peters772747b2001-08-09 22:21:55 +00005229 q = (unsigned char *)s;
Antoine Pitrouab868312009-01-10 15:40:25 +00005230 e = q + size - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005231
5232 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005233 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005234
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005235 /* Check for BOM marks (U+FEFF) in the input and adjust current
5236 byte order setting accordingly. In native mode, the leading BOM
5237 mark is skipped, in all other modes, it is copied to the output
5238 stream as-is (giving a ZWNBSP character). */
5239 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00005240 if (size >= 2) {
Victor Stinner24729f32011-11-10 20:31:37 +01005241 const Py_UCS4 bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005242#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Benjamin Peterson29060642009-01-31 22:14:21 +00005243 if (bom == 0xFEFF) {
5244 q += 2;
5245 bo = -1;
5246 }
5247 else if (bom == 0xFFFE) {
5248 q += 2;
5249 bo = 1;
5250 }
Tim Petersced69f82003-09-16 20:30:58 +00005251#else
Benjamin Peterson29060642009-01-31 22:14:21 +00005252 if (bom == 0xFEFF) {
5253 q += 2;
5254 bo = 1;
5255 }
5256 else if (bom == 0xFFFE) {
5257 q += 2;
5258 bo = -1;
5259 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005260#endif
Benjamin Peterson29060642009-01-31 22:14:21 +00005261 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005262 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005263
Tim Peters772747b2001-08-09 22:21:55 +00005264 if (bo == -1) {
5265 /* force LE */
5266 ihi = 1;
5267 ilo = 0;
5268 }
5269 else if (bo == 1) {
5270 /* force BE */
5271 ihi = 0;
5272 ilo = 1;
5273 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005274#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5275 native_ordering = ilo < ihi;
5276#else
5277 native_ordering = ilo > ihi;
5278#endif
Tim Peters772747b2001-08-09 22:21:55 +00005279
Antoine Pitrouab868312009-01-10 15:40:25 +00005280 aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
Tim Peters772747b2001-08-09 22:21:55 +00005281 while (q < e) {
Victor Stinner24729f32011-11-10 20:31:37 +01005282 Py_UCS4 ch;
Antoine Pitrouab868312009-01-10 15:40:25 +00005283 /* First check for possible aligned read of a C 'long'. Unaligned
5284 reads are more expensive, better to defer to another iteration. */
5285 if (!((size_t) q & LONG_PTR_MASK)) {
5286 /* Fast path for runs of non-surrogate chars. */
5287 register const unsigned char *_q = q;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005288 int kind = PyUnicode_KIND(unicode);
5289 void *data = PyUnicode_DATA(unicode);
5290 while (_q < aligned_end) {
5291 unsigned long block = * (unsigned long *) _q;
5292 unsigned short *pblock = (unsigned short*)&block;
5293 Py_UCS4 maxch;
5294 if (native_ordering) {
5295 /* Can use buffer directly */
5296 if (block & FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005297 break;
Antoine Pitrouab868312009-01-10 15:40:25 +00005298 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005299 else {
5300 /* Need to byte-swap */
5301 unsigned char *_p = (unsigned char*)pblock;
5302 if (block & SWAPPED_FAST_CHAR_MASK)
Antoine Pitrouab868312009-01-10 15:40:25 +00005303 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005304 _p[0] = _q[1];
5305 _p[1] = _q[0];
5306 _p[2] = _q[3];
5307 _p[3] = _q[2];
Antoine Pitrouab868312009-01-10 15:40:25 +00005308#if (SIZEOF_LONG == 8)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005309 _p[4] = _q[5];
5310 _p[5] = _q[4];
5311 _p[6] = _q[7];
5312 _p[7] = _q[6];
Antoine Pitrouab868312009-01-10 15:40:25 +00005313#endif
Antoine Pitrouab868312009-01-10 15:40:25 +00005314 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005315 maxch = Py_MAX(pblock[0], pblock[1]);
5316#if SIZEOF_LONG == 8
5317 maxch = Py_MAX(maxch, Py_MAX(pblock[2], pblock[3]));
5318#endif
5319 if (maxch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
5320 if (unicode_widen(&unicode, maxch) < 0)
5321 goto onError;
5322 kind = PyUnicode_KIND(unicode);
5323 data = PyUnicode_DATA(unicode);
5324 }
5325 PyUnicode_WRITE(kind, data, outpos++, pblock[0]);
5326 PyUnicode_WRITE(kind, data, outpos++, pblock[1]);
5327#if SIZEOF_LONG == 8
5328 PyUnicode_WRITE(kind, data, outpos++, pblock[2]);
5329 PyUnicode_WRITE(kind, data, outpos++, pblock[3]);
5330#endif
5331 _q += SIZEOF_LONG;
Antoine Pitrouab868312009-01-10 15:40:25 +00005332 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005333 q = _q;
5334 if (q >= e)
5335 break;
5336 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005337 ch = (q[ihi] << 8) | q[ilo];
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005338
Benjamin Peterson14339b62009-01-31 16:36:08 +00005339 q += 2;
Benjamin Peterson29060642009-01-31 22:14:21 +00005340
Victor Stinner551ac952011-11-29 22:58:13 +01005341 if (!Py_UNICODE_IS_SURROGATE(ch)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005342 if (unicode_putchar(&unicode, &outpos, ch) < 0)
5343 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005344 continue;
5345 }
5346
5347 /* UTF-16 code pair: */
5348 if (q > e) {
5349 errmsg = "unexpected end of data";
5350 startinpos = (((const char *)q) - 2) - starts;
5351 endinpos = ((const char *)e) + 1 - starts;
5352 goto utf16Error;
5353 }
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005354 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)) {
5355 Py_UCS4 ch2 = (q[ihi] << 8) | q[ilo];
Benjamin Peterson29060642009-01-31 22:14:21 +00005356 q += 2;
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005357 if (Py_UNICODE_IS_LOW_SURROGATE(ch2)) {
Victor Stinner62aa4d02011-11-09 00:03:45 +01005358 if (unicode_putchar(&unicode, &outpos,
Victor Stinner2e9cfad2011-11-20 18:40:27 +01005359 Py_UNICODE_JOIN_SURROGATES(ch, ch2)) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005360 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005361 continue;
5362 }
5363 else {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005364 errmsg = "illegal UTF-16 surrogate";
Benjamin Peterson29060642009-01-31 22:14:21 +00005365 startinpos = (((const char *)q)-4)-starts;
5366 endinpos = startinpos+2;
5367 goto utf16Error;
5368 }
5369
Benjamin Peterson14339b62009-01-31 16:36:08 +00005370 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005371 errmsg = "illegal encoding";
5372 startinpos = (((const char *)q)-2)-starts;
5373 endinpos = startinpos+2;
5374 /* Fall through to report the error */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005375
Benjamin Peterson29060642009-01-31 22:14:21 +00005376 utf16Error:
Benjamin Peterson29060642009-01-31 22:14:21 +00005377 if (unicode_decode_call_errorhandler(
Antoine Pitrouab868312009-01-10 15:40:25 +00005378 errors,
5379 &errorHandler,
5380 "utf16", errmsg,
5381 &starts,
5382 (const char **)&e,
5383 &startinpos,
5384 &endinpos,
5385 &exc,
5386 (const char **)&q,
5387 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005388 &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00005389 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005390 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005391 /* remaining byte at the end? (size should be even) */
5392 if (e == q) {
5393 if (!consumed) {
5394 errmsg = "truncated data";
5395 startinpos = ((const char *)q) - starts;
5396 endinpos = ((const char *)e) + 1 - starts;
Antoine Pitrouab868312009-01-10 15:40:25 +00005397 if (unicode_decode_call_errorhandler(
5398 errors,
5399 &errorHandler,
5400 "utf16", errmsg,
5401 &starts,
5402 (const char **)&e,
5403 &startinpos,
5404 &endinpos,
5405 &exc,
5406 (const char **)&q,
5407 &unicode,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005408 &outpos))
Antoine Pitrouab868312009-01-10 15:40:25 +00005409 goto onError;
5410 /* The remaining input chars are ignored if the callback
5411 chooses to skip the input */
5412 }
5413 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005414
5415 if (byteorder)
5416 *byteorder = bo;
5417
Walter Dörwald69652032004-09-07 20:24:22 +00005418 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005419 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005420
Guido van Rossumd57fd912000-03-10 22:53:23 +00005421 /* Adjust length */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005422 if (PyUnicode_Resize(&unicode, outpos) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005423 goto onError;
5424
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005425 Py_XDECREF(errorHandler);
5426 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005427 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005428
Benjamin Peterson29060642009-01-31 22:14:21 +00005429 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005430 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005431 Py_XDECREF(errorHandler);
5432 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005433 return NULL;
5434}
5435
Antoine Pitrouab868312009-01-10 15:40:25 +00005436#undef FAST_CHAR_MASK
5437#undef SWAPPED_FAST_CHAR_MASK
5438
Tim Peters772747b2001-08-09 22:21:55 +00005439PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005440_PyUnicode_EncodeUTF16(PyObject *str,
5441 const char *errors,
5442 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005443{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005444 int kind;
5445 void *data;
5446 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005447 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00005448 unsigned char *p;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005449 Py_ssize_t nsize, bytesize;
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005450 Py_ssize_t i, pairs;
Tim Peters772747b2001-08-09 22:21:55 +00005451 /* Offsets from p for storing byte pairs in the right order. */
5452#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5453 int ihi = 1, ilo = 0;
5454#else
5455 int ihi = 0, ilo = 1;
5456#endif
5457
Benjamin Peterson29060642009-01-31 22:14:21 +00005458#define STORECHAR(CH) \
5459 do { \
5460 p[ihi] = ((CH) >> 8) & 0xff; \
5461 p[ilo] = (CH) & 0xff; \
5462 p += 2; \
Tim Peters772747b2001-08-09 22:21:55 +00005463 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005464
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005465 if (!PyUnicode_Check(str)) {
5466 PyErr_BadArgument();
5467 return NULL;
5468 }
5469 if (PyUnicode_READY(str) < 0)
5470 return NULL;
5471 kind = PyUnicode_KIND(str);
5472 data = PyUnicode_DATA(str);
5473 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005474
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005475 pairs = 0;
5476 if (kind == PyUnicode_4BYTE_KIND)
5477 for (i = 0; i < len; i++)
5478 if (PyUnicode_READ(kind, data, i) >= 0x10000)
5479 pairs++;
5480 /* 2 * (len + pairs + (byteorder == 0)) */
5481 if (len > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005482 return PyErr_NoMemory();
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005483 nsize = len + pairs + (byteorder == 0);
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005484 bytesize = nsize * 2;
5485 if (bytesize / 2 != nsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005486 return PyErr_NoMemory();
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005487 v = PyBytes_FromStringAndSize(NULL, bytesize);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005488 if (v == NULL)
5489 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005490
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005491 p = (unsigned char *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005492 if (byteorder == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00005493 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005494 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005495 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005496
5497 if (byteorder == -1) {
5498 /* force LE */
5499 ihi = 1;
5500 ilo = 0;
5501 }
5502 else if (byteorder == 1) {
5503 /* force BE */
5504 ihi = 0;
5505 ilo = 1;
5506 }
5507
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005508 for (i = 0; i < len; i++) {
5509 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5510 Py_UCS4 ch2 = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +00005511 if (ch >= 0x10000) {
Victor Stinner551ac952011-11-29 22:58:13 +01005512 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
5513 ch = Py_UNICODE_HIGH_SURROGATE(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00005514 }
Tim Peters772747b2001-08-09 22:21:55 +00005515 STORECHAR(ch);
5516 if (ch2)
5517 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005518 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005519
5520 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005521 return v;
Tim Peters772747b2001-08-09 22:21:55 +00005522#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005523}
5524
Alexander Belopolsky40018472011-02-26 01:02:56 +00005525PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005526PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5527 Py_ssize_t size,
5528 const char *errors,
5529 int byteorder)
5530{
5531 PyObject *result;
5532 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5533 if (tmp == NULL)
5534 return NULL;
5535 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5536 Py_DECREF(tmp);
5537 return result;
5538}
5539
5540PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005541PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005542{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005543 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005544}
5545
5546/* --- Unicode Escape Codec ----------------------------------------------- */
5547
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005548/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5549 if all the escapes in the string make it still a valid ASCII string.
5550 Returns -1 if any escapes were found which cause the string to
5551 pop out of ASCII range. Otherwise returns the length of the
5552 required buffer to hold the string.
5553 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005554static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005555length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5556{
5557 const unsigned char *p = (const unsigned char *)s;
5558 const unsigned char *end = p + size;
5559 Py_ssize_t length = 0;
5560
5561 if (size < 0)
5562 return -1;
5563
5564 for (; p < end; ++p) {
5565 if (*p > 127) {
5566 /* Non-ASCII */
5567 return -1;
5568 }
5569 else if (*p != '\\') {
5570 /* Normal character */
5571 ++length;
5572 }
5573 else {
5574 /* Backslash-escape, check next char */
5575 ++p;
5576 /* Escape sequence reaches till end of string or
5577 non-ASCII follow-up. */
5578 if (p >= end || *p > 127)
5579 return -1;
5580 switch (*p) {
5581 case '\n':
5582 /* backslash + \n result in zero characters */
5583 break;
5584 case '\\': case '\'': case '\"':
5585 case 'b': case 'f': case 't':
5586 case 'n': case 'r': case 'v': case 'a':
5587 ++length;
5588 break;
5589 case '0': case '1': case '2': case '3':
5590 case '4': case '5': case '6': case '7':
5591 case 'x': case 'u': case 'U': case 'N':
5592 /* these do not guarantee ASCII characters */
5593 return -1;
5594 default:
5595 /* count the backslash + the other character */
5596 length += 2;
5597 }
5598 }
5599 }
5600 return length;
5601}
5602
/* File-scope pointer to a _PyUnicode_Name_CAPI structure; it starts out
   NULL.  NOTE(review): presumably filled in on first use by the \N{...}
   handling of the unicode-escape decoder -- confirm against the code
   that assigns it. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00005603static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005604
Alexander Belopolsky40018472011-02-26 01:02:56 +00005605PyObject *
5606PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005607 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005608 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005609{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005610 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005611 Py_ssize_t startinpos;
5612 Py_ssize_t endinpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005613 int j;
Victor Stinner7931d9a2011-11-04 00:22:48 +01005614 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005615 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005616 char* message;
5617 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005618 PyObject *errorHandler = NULL;
5619 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005620 Py_ssize_t len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005621 Py_ssize_t i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005622
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005623 len = length_of_escaped_ascii_string(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005624
5625 /* After length_of_escaped_ascii_string() there are two alternatives,
5626 either the string is pure ASCII with named escapes like \n, etc.
5627 and we determined it's exact size (common case)
5628 or it contains \x, \u, ... escape sequences. then we create a
5629 legacy wchar string and resize it at the end of this function. */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005630 if (len >= 0) {
5631 v = PyUnicode_New(len, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005632 if (!v)
5633 goto onError;
5634 assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005635 }
5636 else {
5637 /* Escaped strings will always be longer than the resulting
5638 Unicode string, so we start with size here and then reduce the
5639 length after conversion to the true value.
5640 (but if the error callback returns a long replacement string
5641 we'll have to allocate more space) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005642 v = PyUnicode_New(size, 127);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005643 if (!v)
5644 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005645 len = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005646 }
5647
Guido van Rossumd57fd912000-03-10 22:53:23 +00005648 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01005649 return v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005650 i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005651 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005652
Guido van Rossumd57fd912000-03-10 22:53:23 +00005653 while (s < end) {
5654 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005655 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005656 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005657
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005658 /* The only case in which i == ascii_length is a backslash
5659 followed by a newline. */
5660 assert(i <= len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005661
Guido van Rossumd57fd912000-03-10 22:53:23 +00005662 /* Non-escape characters are interpreted as Unicode ordinals */
5663 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005664 if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
5665 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005666 continue;
5667 }
5668
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005669 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005670 /* \ - Escapes */
5671 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005672 c = *s++;
5673 if (s > end)
5674 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005675
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005676 /* The only case in which i == ascii_length is a backslash
5677 followed by a newline. */
5678 assert(i < len || (i == len && c == '\n'));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005679
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005680 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005681
Benjamin Peterson29060642009-01-31 22:14:21 +00005682 /* \x escapes */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005683#define WRITECHAR(ch) \
5684 do { \
5685 if (unicode_putchar(&v, &i, ch) < 0) \
5686 goto onError; \
5687 }while(0)
5688
Guido van Rossumd57fd912000-03-10 22:53:23 +00005689 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005690 case '\\': WRITECHAR('\\'); break;
5691 case '\'': WRITECHAR('\''); break;
5692 case '\"': WRITECHAR('\"'); break;
5693 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005694 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005695 case 'f': WRITECHAR('\014'); break;
5696 case 't': WRITECHAR('\t'); break;
5697 case 'n': WRITECHAR('\n'); break;
5698 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005699 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005700 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005701 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005702 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005703
Benjamin Peterson29060642009-01-31 22:14:21 +00005704 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005705 case '0': case '1': case '2': case '3':
5706 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005707 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005708 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005709 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005710 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005711 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005712 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005713 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005714 break;
5715
Benjamin Peterson29060642009-01-31 22:14:21 +00005716 /* hex escapes */
5717 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005718 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005719 digits = 2;
5720 message = "truncated \\xXX escape";
5721 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005722
Benjamin Peterson29060642009-01-31 22:14:21 +00005723 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005724 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005725 digits = 4;
5726 message = "truncated \\uXXXX escape";
5727 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005728
Benjamin Peterson29060642009-01-31 22:14:21 +00005729 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005730 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005731 digits = 8;
5732 message = "truncated \\UXXXXXXXX escape";
5733 hexescape:
5734 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005735 if (s+digits>end) {
5736 endinpos = size;
5737 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005738 errors, &errorHandler,
5739 "unicodeescape", "end of string in escape sequence",
5740 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005741 &v, &i))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005742 goto onError;
5743 goto nextByte;
5744 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005745 for (j = 0; j < digits; ++j) {
5746 c = (unsigned char) s[j];
David Malcolm96960882010-11-05 17:23:41 +00005747 if (!Py_ISXDIGIT(c)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005748 endinpos = (s+j+1)-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005749 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005750 errors, &errorHandler,
5751 "unicodeescape", message,
5752 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005753 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005754 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005755 len = PyUnicode_GET_LENGTH(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005756 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005757 }
5758 chr = (chr<<4) & ~0xF;
5759 if (c >= '0' && c <= '9')
5760 chr += c - '0';
5761 else if (c >= 'a' && c <= 'f')
5762 chr += 10 + c - 'a';
5763 else
5764 chr += 10 + c - 'A';
5765 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005766 s += j;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005767 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005768 /* _decoding_error will have already written into the
5769 target buffer. */
5770 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005771 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005772 /* when we get here, chr is a 32-bit unicode character */
Victor Stinner8faf8212011-12-08 22:14:11 +01005773 if (chr <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005774 WRITECHAR(chr);
Fredrik Lundhdf846752000-09-03 11:29:49 +00005775 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005776 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005777 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005778 errors, &errorHandler,
5779 "unicodeescape", "illegal Unicode character",
5780 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005781 &v, &i))
Fredrik Lundhdf846752000-09-03 11:29:49 +00005782 goto onError;
5783 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005784 break;
5785
Benjamin Peterson29060642009-01-31 22:14:21 +00005786 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005787 case 'N':
5788 message = "malformed \\N character escape";
5789 if (ucnhash_CAPI == NULL) {
5790 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005791 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5792 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005793 if (ucnhash_CAPI == NULL)
5794 goto ucnhashError;
5795 }
5796 if (*s == '{') {
5797 const char *start = s+1;
5798 /* look for the closing brace */
5799 while (*s != '}' && s < end)
5800 s++;
5801 if (s > start && s < end && *s == '}') {
5802 /* found a name. look it up in the unicode database */
5803 message = "unknown Unicode character name";
5804 s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005805 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005806 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005807 goto store;
5808 }
5809 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005810 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005811 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005812 errors, &errorHandler,
5813 "unicodeescape", message,
5814 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005815 &v, &i))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005816 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005817 break;
5818
5819 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005820 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005821 message = "\\ at end of string";
5822 s--;
5823 endinpos = s-starts;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005824 if (unicode_decode_call_errorhandler(
Benjamin Peterson29060642009-01-31 22:14:21 +00005825 errors, &errorHandler,
5826 "unicodeescape", message,
5827 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005828 &v, &i))
Walter Dörwald8c077222002-03-25 11:16:18 +00005829 goto onError;
5830 }
5831 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005832 WRITECHAR('\\');
5833 WRITECHAR(s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005834 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005835 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005836 }
Benjamin Peterson29060642009-01-31 22:14:21 +00005837 nextByte:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005838 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005839 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005840#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005841
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005842 if (PyUnicode_Resize(&v, i) < 0)
5843 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005844 Py_XDECREF(errorHandler);
5845 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01005846 return unicode_result(v);
Walter Dörwald8c077222002-03-25 11:16:18 +00005847
Benjamin Peterson29060642009-01-31 22:14:21 +00005848 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005849 PyErr_SetString(
5850 PyExc_UnicodeError,
5851 "\\N escapes not supported (can't load unicodedata module)"
5852 );
Hye-Shik Chang4af5c8c2006-03-07 15:39:21 +00005853 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005854 Py_XDECREF(errorHandler);
5855 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005856 return NULL;
5857
Benjamin Peterson29060642009-01-31 22:14:21 +00005858 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00005859 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005860 Py_XDECREF(errorHandler);
5861 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005862 return NULL;
5863}
5864
/* Return a Unicode-Escape encoded bytes object for the Unicode object.

   Only the escaped characters are produced; no surrounding quotes are
   added to the result.

*/
5871
Alexander Belopolsky40018472011-02-26 01:02:56 +00005872PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005873PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005874{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005875 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005876 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005877 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005878 int kind;
5879 void *data;
5880 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005881
Thomas Wouters89f507f2006-12-13 04:49:30 +00005882 /* Initial allocation is based on the longest-possible unichr
5883 escape.
5884
5885 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5886 unichr, so in this case it's the longest unichr escape. In
5887 narrow (UTF-16) builds this is five chars per source unichr
5888 since there are two unichrs in the surrogate pair, so in narrow
5889 (UTF-16) builds it's not the longest unichr escape.
5890
5891 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5892 so in the narrow (UTF-16) build case it's the longest unichr
5893 escape.
5894 */
5895
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005896 if (!PyUnicode_Check(unicode)) {
5897 PyErr_BadArgument();
5898 return NULL;
5899 }
5900 if (PyUnicode_READY(unicode) < 0)
5901 return NULL;
5902 len = PyUnicode_GET_LENGTH(unicode);
5903 kind = PyUnicode_KIND(unicode);
5904 data = PyUnicode_DATA(unicode);
5905 switch(kind) {
5906 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5907 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5908 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5909 }
5910
5911 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005912 return PyBytes_FromStringAndSize(NULL, 0);
5913
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005914 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005915 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005916
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005917 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005918 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005919 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005920 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005921 if (repr == NULL)
5922 return NULL;
5923
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005924 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005925
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005926 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005927 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005928
Walter Dörwald79e913e2007-05-12 11:08:06 +00005929 /* Escape backslashes */
5930 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005931 *p++ = '\\';
5932 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005933 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005934 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005935
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005936 /* Map 21-bit characters to '\U00xxxxxx' */
5937 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005938 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005939 *p++ = '\\';
5940 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005941 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5942 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5943 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5944 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5945 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5946 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5947 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5948 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005949 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005950 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005951
Guido van Rossumd57fd912000-03-10 22:53:23 +00005952 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005953 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005954 *p++ = '\\';
5955 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005956 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5957 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5958 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5959 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005960 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005961
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005962 /* Map special whitespace to '\t', \n', '\r' */
5963 else if (ch == '\t') {
5964 *p++ = '\\';
5965 *p++ = 't';
5966 }
5967 else if (ch == '\n') {
5968 *p++ = '\\';
5969 *p++ = 'n';
5970 }
5971 else if (ch == '\r') {
5972 *p++ = '\\';
5973 *p++ = 'r';
5974 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005975
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005976 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005977 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005978 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005979 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005980 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5981 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005982 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005983
Guido van Rossumd57fd912000-03-10 22:53:23 +00005984 /* Copy everything else as-is */
5985 else
5986 *p++ = (char) ch;
5987 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005988
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005989 assert(p - PyBytes_AS_STRING(repr) > 0);
5990 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5991 return NULL;
5992 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005993}
5994
Alexander Belopolsky40018472011-02-26 01:02:56 +00005995PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005996PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5997 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005998{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005999 PyObject *result;
6000 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6001 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006002 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006003 result = PyUnicode_AsUnicodeEscapeString(tmp);
6004 Py_DECREF(tmp);
6005 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006006}
6007
6008/* --- Raw Unicode Escape Codec ------------------------------------------- */
6009
Alexander Belopolsky40018472011-02-26 01:02:56 +00006010PyObject *
6011PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006012 Py_ssize_t size,
6013 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006014{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006015 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006016 Py_ssize_t startinpos;
6017 Py_ssize_t endinpos;
6018 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006019 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006020 const char *end;
6021 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006022 PyObject *errorHandler = NULL;
6023 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006024
Guido van Rossumd57fd912000-03-10 22:53:23 +00006025 /* Escaped strings will always be longer than the resulting
6026 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006027 length after conversion to the true value. (But decoding error
6028 handler might have to resize the string) */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006029 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006030 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006031 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006032 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006033 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006034 outpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006035 end = s + size;
6036 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006037 unsigned char c;
6038 Py_UCS4 x;
6039 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006040 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006041
Benjamin Peterson29060642009-01-31 22:14:21 +00006042 /* Non-escape characters are interpreted as Unicode ordinals */
6043 if (*s != '\\') {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006044 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6045 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006046 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006047 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006048 startinpos = s-starts;
6049
6050 /* \u-escapes are only interpreted iff the number of leading
6051 backslashes if odd */
6052 bs = s;
6053 for (;s < end;) {
6054 if (*s != '\\')
6055 break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006056 if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
6057 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006058 }
6059 if (((s - bs) & 1) == 0 ||
6060 s >= end ||
6061 (*s != 'u' && *s != 'U')) {
6062 continue;
6063 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006064 outpos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006065 count = *s=='u' ? 4 : 8;
6066 s++;
6067
6068 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006069 for (x = 0, i = 0; i < count; ++i, ++s) {
6070 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006071 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006072 endinpos = s-starts;
6073 if (unicode_decode_call_errorhandler(
6074 errors, &errorHandler,
6075 "rawunicodeescape", "truncated \\uXXXX",
6076 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006077 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006078 goto onError;
6079 goto nextByte;
6080 }
6081 x = (x<<4) & ~0xF;
6082 if (c >= '0' && c <= '9')
6083 x += c - '0';
6084 else if (c >= 'a' && c <= 'f')
6085 x += 10 + c - 'a';
6086 else
6087 x += 10 + c - 'A';
6088 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006089 if (x <= MAX_UNICODE) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006090 if (unicode_putchar(&v, &outpos, x) < 0)
6091 goto onError;
Christian Heimesfe337bf2008-03-23 21:54:12 +00006092 } else {
6093 endinpos = s-starts;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006094 if (unicode_decode_call_errorhandler(
6095 errors, &errorHandler,
6096 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006097 &starts, &end, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006098 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006099 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006100 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006101 nextByte:
6102 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006103 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006104 if (PyUnicode_Resize(&v, outpos) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006105 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006106 Py_XDECREF(errorHandler);
6107 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006108 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00006109
Benjamin Peterson29060642009-01-31 22:14:21 +00006110 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006111 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006112 Py_XDECREF(errorHandler);
6113 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006114 return NULL;
6115}
6116
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006117
Alexander Belopolsky40018472011-02-26 01:02:56 +00006118PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006119PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006120{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006121 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006122 char *p;
6123 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006124 Py_ssize_t expandsize, pos;
6125 int kind;
6126 void *data;
6127 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006128
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006129 if (!PyUnicode_Check(unicode)) {
6130 PyErr_BadArgument();
6131 return NULL;
6132 }
6133 if (PyUnicode_READY(unicode) < 0)
6134 return NULL;
6135 kind = PyUnicode_KIND(unicode);
6136 data = PyUnicode_DATA(unicode);
6137 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006138 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6139 bytes, and 1 byte characters 4. */
6140 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006141
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006142 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006143 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006144
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006145 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006146 if (repr == NULL)
6147 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006148 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006149 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006150
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006151 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006152 for (pos = 0; pos < len; pos++) {
6153 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006154 /* Map 32-bit characters to '\Uxxxxxxxx' */
6155 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006156 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006157 *p++ = '\\';
6158 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006159 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6160 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6161 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6162 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6163 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6164 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6165 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6166 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006167 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006168 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006169 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006170 *p++ = '\\';
6171 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006172 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6173 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6174 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6175 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006176 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006177 /* Copy everything else as-is */
6178 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006179 *p++ = (char) ch;
6180 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006181
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006182 assert(p > q);
6183 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006184 return NULL;
6185 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006186}
6187
Alexander Belopolsky40018472011-02-26 01:02:56 +00006188PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006189PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6190 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006191{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006192 PyObject *result;
6193 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6194 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006195 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006196 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6197 Py_DECREF(tmp);
6198 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006199}
6200
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006201/* --- Unicode Internal Codec ------------------------------------------- */
6202
Alexander Belopolsky40018472011-02-26 01:02:56 +00006203PyObject *
6204_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006205 Py_ssize_t size,
6206 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006207{
6208 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006209 Py_ssize_t startinpos;
6210 Py_ssize_t endinpos;
6211 Py_ssize_t outpos;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006212 PyObject *v;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006213 const char *end;
6214 const char *reason;
6215 PyObject *errorHandler = NULL;
6216 PyObject *exc = NULL;
6217
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006218 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006219 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006220 1))
6221 return NULL;
6222
Thomas Wouters89f507f2006-12-13 04:49:30 +00006223 /* XXX overflow detection missing */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006224 v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006225 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006226 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006227 if (PyUnicode_GET_LENGTH(v) == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006228 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006229 outpos = 0;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006230 end = s + size;
6231
6232 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006233 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006234 Py_UCS4 ch;
6235 /* We copy the raw representation one byte at a time because the
6236 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006237 ((char *) &uch)[0] = s[0];
6238 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006239#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006240 ((char *) &uch)[2] = s[2];
6241 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006242#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006243 ch = uch;
6244
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006245 /* We have to sanity check the raw data, otherwise doom looms for
6246 some malformed UCS-4 data. */
6247 if (
Benjamin Peterson29060642009-01-31 22:14:21 +00006248#ifdef Py_UNICODE_WIDE
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006249 ch > 0x10ffff ||
Benjamin Peterson29060642009-01-31 22:14:21 +00006250#endif
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006251 end-s < Py_UNICODE_SIZE
6252 )
Benjamin Peterson29060642009-01-31 22:14:21 +00006253 {
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006254 startinpos = s - starts;
6255 if (end-s < Py_UNICODE_SIZE) {
6256 endinpos = end-starts;
6257 reason = "truncated input";
6258 }
6259 else {
6260 endinpos = s - starts + Py_UNICODE_SIZE;
6261 reason = "illegal code point (> 0x10FFFF)";
6262 }
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006263 if (unicode_decode_call_errorhandler(
6264 errors, &errorHandler,
6265 "unicode_internal", reason,
Walter Dörwalde78178e2007-07-30 13:31:40 +00006266 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006267 &v, &outpos))
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006268 goto onError;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006269 continue;
6270 }
6271
6272 s += Py_UNICODE_SIZE;
6273#ifndef Py_UNICODE_WIDE
Victor Stinner551ac952011-11-29 22:58:13 +01006274 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006275 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006276 Py_UNICODE uch2;
6277 ((char *) &uch2)[0] = s[0];
6278 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006279 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006280 {
Victor Stinner551ac952011-11-29 22:58:13 +01006281 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006282 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006283 }
6284 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006285#endif
6286
6287 if (unicode_putchar(&v, &outpos, ch) < 0)
6288 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006289 }
6290
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006291 if (PyUnicode_Resize(&v, outpos) < 0)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006292 goto onError;
6293 Py_XDECREF(errorHandler);
6294 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006295 return unicode_result(v);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006296
Benjamin Peterson29060642009-01-31 22:14:21 +00006297 onError:
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006298 Py_XDECREF(v);
6299 Py_XDECREF(errorHandler);
6300 Py_XDECREF(exc);
6301 return NULL;
6302}
6303
Guido van Rossumd57fd912000-03-10 22:53:23 +00006304/* --- Latin-1 Codec ------------------------------------------------------ */
6305
Alexander Belopolsky40018472011-02-26 01:02:56 +00006306PyObject *
6307PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006308 Py_ssize_t size,
6309 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006310{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006311 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006312 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006313}
6314
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006315/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006316static void
6317make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006318 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006319 PyObject *unicode,
6320 Py_ssize_t startpos, Py_ssize_t endpos,
6321 const char *reason)
6322{
6323 if (*exceptionObject == NULL) {
6324 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006325 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006326 encoding, unicode, startpos, endpos, reason);
6327 }
6328 else {
6329 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6330 goto onError;
6331 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6332 goto onError;
6333 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6334 goto onError;
6335 return;
6336 onError:
6337 Py_DECREF(*exceptionObject);
6338 *exceptionObject = NULL;
6339 }
6340}
6341
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006342/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006343static void
6344raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006345 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006346 PyObject *unicode,
6347 Py_ssize_t startpos, Py_ssize_t endpos,
6348 const char *reason)
6349{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006350 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006351 encoding, unicode, startpos, endpos, reason);
6352 if (*exceptionObject != NULL)
6353 PyCodec_StrictErrors(*exceptionObject);
6354}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006355
6356/* error handling callback helper:
6357 build arguments, call the callback and check the arguments,
6358 put the result into newpos and return the replacement string, which
6359 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006360static PyObject *
6361unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006362 PyObject **errorHandler,
6363 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006364 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006365 Py_ssize_t startpos, Py_ssize_t endpos,
6366 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006367{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006368 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006369 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006370 PyObject *restuple;
6371 PyObject *resunicode;
6372
6373 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006374 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006375 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006376 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006377 }
6378
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006379 if (PyUnicode_READY(unicode) < 0)
6380 return NULL;
6381 len = PyUnicode_GET_LENGTH(unicode);
6382
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006383 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006384 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006385 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006386 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006387
6388 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006389 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006390 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006391 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006392 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006393 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006394 Py_DECREF(restuple);
6395 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006396 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006397 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006398 &resunicode, newpos)) {
6399 Py_DECREF(restuple);
6400 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006401 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006402 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6403 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6404 Py_DECREF(restuple);
6405 return NULL;
6406 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006407 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006408 *newpos = len + *newpos;
6409 if (*newpos<0 || *newpos>len) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006410 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6411 Py_DECREF(restuple);
6412 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006413 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006414 Py_INCREF(resunicode);
6415 Py_DECREF(restuple);
6416 return resunicode;
6417}
6418
Alexander Belopolsky40018472011-02-26 01:02:56 +00006419static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006420unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006421 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006422 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006423{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006424 /* input state */
6425 Py_ssize_t pos=0, size;
6426 int kind;
6427 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006428 /* output object */
6429 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006430 /* pointer into the output */
6431 char *str;
6432 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006433 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006434 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6435 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006436 PyObject *errorHandler = NULL;
6437 PyObject *exc = NULL;
6438 /* the following variable is used for caching string comparisons
6439 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6440 int known_errorHandler = -1;
6441
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006442 if (PyUnicode_READY(unicode) < 0)
6443 return NULL;
6444 size = PyUnicode_GET_LENGTH(unicode);
6445 kind = PyUnicode_KIND(unicode);
6446 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006447 /* allocate enough for a simple encoding without
6448 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006449 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006450 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006451 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006452 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006453 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006454 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006455 ressize = size;
6456
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006457 while (pos < size) {
6458 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006459
Benjamin Peterson29060642009-01-31 22:14:21 +00006460 /* can we encode this? */
6461 if (c<limit) {
6462 /* no overflow check, because we know that the space is enough */
6463 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006464 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006465 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006466 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006467 Py_ssize_t requiredsize;
6468 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006469 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006470 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006471 Py_ssize_t collstart = pos;
6472 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006473 /* find all unecodable characters */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006474 while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006475 ++collend;
6476 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6477 if (known_errorHandler==-1) {
6478 if ((errors==NULL) || (!strcmp(errors, "strict")))
6479 known_errorHandler = 1;
6480 else if (!strcmp(errors, "replace"))
6481 known_errorHandler = 2;
6482 else if (!strcmp(errors, "ignore"))
6483 known_errorHandler = 3;
6484 else if (!strcmp(errors, "xmlcharrefreplace"))
6485 known_errorHandler = 4;
6486 else
6487 known_errorHandler = 0;
6488 }
6489 switch (known_errorHandler) {
6490 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006491 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006492 goto onError;
6493 case 2: /* replace */
6494 while (collstart++<collend)
6495 *str++ = '?'; /* fall through */
6496 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006497 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006498 break;
6499 case 4: /* xmlcharrefreplace */
6500 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006501 /* determine replacement size */
6502 for (i = collstart, repsize = 0; i < collend; ++i) {
6503 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6504 if (ch < 10)
Benjamin Peterson29060642009-01-31 22:14:21 +00006505 repsize += 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006506 else if (ch < 100)
Benjamin Peterson29060642009-01-31 22:14:21 +00006507 repsize += 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006508 else if (ch < 1000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006509 repsize += 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006510 else if (ch < 10000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006511 repsize += 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006512 else if (ch < 100000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006513 repsize += 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006514 else if (ch < 1000000)
Benjamin Peterson29060642009-01-31 22:14:21 +00006515 repsize += 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006516 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006517 assert(ch <= MAX_UNICODE);
Benjamin Peterson29060642009-01-31 22:14:21 +00006518 repsize += 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006519 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006520 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006521 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006522 if (requiredsize > ressize) {
6523 if (requiredsize<2*ressize)
6524 requiredsize = 2*ressize;
6525 if (_PyBytes_Resize(&res, requiredsize))
6526 goto onError;
6527 str = PyBytes_AS_STRING(res) + respos;
6528 ressize = requiredsize;
6529 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006530 /* generate replacement */
6531 for (i = collstart; i < collend; ++i) {
6532 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006533 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006534 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006535 break;
6536 default:
6537 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006538 encoding, reason, unicode, &exc,
6539 collstart, collend, &newpos);
6540 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
6541 PyUnicode_READY(repunicode) < 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00006542 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006543 if (PyBytes_Check(repunicode)) {
6544 /* Directly copy bytes result to output. */
6545 repsize = PyBytes_Size(repunicode);
6546 if (repsize > 1) {
6547 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006548 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006549 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6550 Py_DECREF(repunicode);
6551 goto onError;
6552 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006553 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006554 ressize += repsize-1;
6555 }
6556 memcpy(str, PyBytes_AsString(repunicode), repsize);
6557 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006558 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006559 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006560 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006561 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006562 /* need more space? (at least enough for what we
6563 have+the replacement+the rest of the string, so
6564 we won't have to check space for encodable characters) */
6565 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006566 repsize = PyUnicode_GET_LENGTH(repunicode);
6567 requiredsize = respos+repsize+(size-collend);
Benjamin Peterson29060642009-01-31 22:14:21 +00006568 if (requiredsize > ressize) {
6569 if (requiredsize<2*ressize)
6570 requiredsize = 2*ressize;
6571 if (_PyBytes_Resize(&res, requiredsize)) {
6572 Py_DECREF(repunicode);
6573 goto onError;
6574 }
6575 str = PyBytes_AS_STRING(res) + respos;
6576 ressize = requiredsize;
6577 }
6578 /* check if there is anything unencodable in the replacement
6579 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006580 for (i = 0; repsize-->0; ++i, ++str) {
6581 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006582 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006583 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006584 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006585 Py_DECREF(repunicode);
6586 goto onError;
6587 }
6588 *str = (char)c;
6589 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006590 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006591 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006592 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006593 }
6594 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006595 /* Resize if we allocated to much */
6596 size = str - PyBytes_AS_STRING(res);
6597 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006598 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006599 if (_PyBytes_Resize(&res, size) < 0)
6600 goto onError;
6601 }
6602
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006603 Py_XDECREF(errorHandler);
6604 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006605 return res;
6606
6607 onError:
6608 Py_XDECREF(res);
6609 Py_XDECREF(errorHandler);
6610 Py_XDECREF(exc);
6611 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006612}
6613
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006614/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006615PyObject *
6616PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006617 Py_ssize_t size,
6618 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006619{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006620 PyObject *result;
6621 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6622 if (unicode == NULL)
6623 return NULL;
6624 result = unicode_encode_ucs1(unicode, errors, 256);
6625 Py_DECREF(unicode);
6626 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006627}
6628
Alexander Belopolsky40018472011-02-26 01:02:56 +00006629PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006630_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006631{
6632 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006633 PyErr_BadArgument();
6634 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006635 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006636 if (PyUnicode_READY(unicode) == -1)
6637 return NULL;
6638 /* Fast path: if it is a one-byte string, construct
6639 bytes object directly. */
6640 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6641 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6642 PyUnicode_GET_LENGTH(unicode));
6643 /* Non-Latin-1 characters present. Defer to above function to
6644 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006645 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006646}
6647
6648PyObject*
6649PyUnicode_AsLatin1String(PyObject *unicode)
6650{
6651 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006652}
6653
6654/* --- 7-bit ASCII Codec -------------------------------------------------- */
6655
Alexander Belopolsky40018472011-02-26 01:02:56 +00006656PyObject *
6657PyUnicode_DecodeASCII(const char *s,
6658 Py_ssize_t size,
6659 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006660{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006661 const char *starts = s;
Victor Stinner7931d9a2011-11-04 00:22:48 +01006662 PyObject *v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006663 int kind;
6664 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006665 Py_ssize_t startinpos;
6666 Py_ssize_t endinpos;
6667 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006668 const char *e;
Victor Stinner702c7342011-10-05 13:50:52 +02006669 int has_error;
6670 const unsigned char *p = (const unsigned char *)s;
6671 const unsigned char *end = p + size;
6672 const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006673 PyObject *errorHandler = NULL;
6674 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006675
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006676 if (size == 0) {
6677 Py_INCREF(unicode_empty);
6678 return unicode_empty;
6679 }
6680
Guido van Rossumd57fd912000-03-10 22:53:23 +00006681 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006682 if (size == 1 && (unsigned char)s[0] < 128)
6683 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006684
Victor Stinner702c7342011-10-05 13:50:52 +02006685 has_error = 0;
6686 while (p < end && !has_error) {
6687 /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6688 an explanation. */
6689 if (!((size_t) p & LONG_PTR_MASK)) {
6690 /* Help register allocation */
6691 register const unsigned char *_p = p;
6692 while (_p < aligned_end) {
6693 unsigned long value = *(unsigned long *) _p;
6694 if (value & ASCII_CHAR_MASK) {
6695 has_error = 1;
6696 break;
6697 }
6698 _p += SIZEOF_LONG;
6699 }
6700 if (_p == end)
6701 break;
6702 if (has_error)
6703 break;
6704 p = _p;
6705 }
6706 if (*p & 0x80) {
6707 has_error = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006708 break;
Victor Stinner702c7342011-10-05 13:50:52 +02006709 }
6710 else {
6711 ++p;
6712 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006713 }
Victor Stinner702c7342011-10-05 13:50:52 +02006714 if (!has_error)
6715 return unicode_fromascii((const unsigned char *)s, size);
Tim Petersced69f82003-09-16 20:30:58 +00006716
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006717 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006718 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006719 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006720 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01006721 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006722 kind = PyUnicode_KIND(v);
6723 data = PyUnicode_DATA(v);
6724 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006725 e = s + size;
6726 while (s < e) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006727 register unsigned char c = (unsigned char)*s;
6728 if (c < 128) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006729 PyUnicode_WRITE(kind, data, outpos++, c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006730 ++s;
6731 }
6732 else {
6733 startinpos = s-starts;
6734 endinpos = startinpos + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006735 if (unicode_decode_call_errorhandler(
6736 errors, &errorHandler,
6737 "ascii", "ordinal not in range(128)",
6738 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006739 &v, &outpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00006740 goto onError;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006741 kind = PyUnicode_KIND(v);
6742 data = PyUnicode_DATA(v);
Benjamin Peterson29060642009-01-31 22:14:21 +00006743 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006744 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006745 if (PyUnicode_Resize(&v, outpos) < 0)
6746 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006747 Py_XDECREF(errorHandler);
6748 Py_XDECREF(exc);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02006749 assert(_PyUnicode_CheckConsistency(v, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +01006750 return v;
Tim Petersced69f82003-09-16 20:30:58 +00006751
Benjamin Peterson29060642009-01-31 22:14:21 +00006752 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00006753 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006754 Py_XDECREF(errorHandler);
6755 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006756 return NULL;
6757}
6758
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006759/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006760PyObject *
6761PyUnicode_EncodeASCII(const Py_UNICODE *p,
6762 Py_ssize_t size,
6763 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006764{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006765 PyObject *result;
6766 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6767 if (unicode == NULL)
6768 return NULL;
6769 result = unicode_encode_ucs1(unicode, errors, 128);
6770 Py_DECREF(unicode);
6771 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006772}
6773
Alexander Belopolsky40018472011-02-26 01:02:56 +00006774PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006775_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006776{
6777 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006778 PyErr_BadArgument();
6779 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006780 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006781 if (PyUnicode_READY(unicode) == -1)
6782 return NULL;
6783 /* Fast path: if it is an ASCII-only string, construct bytes object
6784 directly. Else defer to above function to raise the exception. */
6785 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6786 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6787 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006788 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006789}
6790
6791PyObject *
6792PyUnicode_AsASCIIString(PyObject *unicode)
6793{
6794 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006795}
6796
Victor Stinner99b95382011-07-04 14:23:54 +02006797#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006798
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006799/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006800
/* When int is narrower than size_t, an input length may not fit in an int.
   NEED_RETRY makes decode_code_page_stateful() split inputs larger than
   INT_MAX into chunks. */
#if SIZEOF_INT < SIZEOF_SIZE_T
#define NEED_RETRY
#endif

/* Fallback definition -- presumably for SDK headers that do not provide
   this flag; TODO confirm. */
#ifndef WC_ERR_INVALID_CHARS
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
6808
6809static char*
6810code_page_name(UINT code_page, PyObject **obj)
6811{
6812 *obj = NULL;
6813 if (code_page == CP_ACP)
6814 return "mbcs";
6815 if (code_page == CP_UTF7)
6816 return "CP_UTF7";
6817 if (code_page == CP_UTF8)
6818 return "CP_UTF8";
6819
6820 *obj = PyBytes_FromFormat("cp%u", code_page);
6821 if (*obj == NULL)
6822 return NULL;
6823 return PyBytes_AS_STRING(*obj);
6824}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006825
Alexander Belopolsky40018472011-02-26 01:02:56 +00006826static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006827is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006828{
6829 const char *curr = s + offset;
Victor Stinner3a50e702011-10-18 21:21:00 +02006830 const char *prev;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006831
Victor Stinner3a50e702011-10-18 21:21:00 +02006832 if (!IsDBCSLeadByteEx(code_page, *curr))
6833 return 0;
6834
6835 prev = CharPrevExA(code_page, s, curr, 0);
6836 if (prev == curr)
6837 return 1;
6838 /* FIXME: This code is limited to "true" double-byte encodings,
6839 as it assumes an incomplete character consists of a single
6840 byte. */
6841 if (curr - prev == 2)
6842 return 1;
6843 if (!IsDBCSLeadByteEx(code_page, *prev))
6844 return 1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006845 return 0;
6846}
6847
Victor Stinner3a50e702011-10-18 21:21:00 +02006848static DWORD
6849decode_code_page_flags(UINT code_page)
6850{
6851 if (code_page == CP_UTF7) {
6852 /* The CP_UTF7 decoder only supports flags=0 */
6853 return 0;
6854 }
6855 else
6856 return MB_ERR_INVALID_CHARS;
6857}
6858
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006859/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006860 * Decode a byte string from a Windows code page into unicode object in strict
6861 * mode.
6862 *
6863 * Returns consumed size if succeed, returns -2 on decode error, or raise a
6864 * WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006865 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006866static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006867decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006868 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006869 const char *in,
6870 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006871{
Victor Stinner3a50e702011-10-18 21:21:00 +02006872 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006873 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006874 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006875
6876 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006877 assert(insize > 0);
6878 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6879 if (outsize <= 0)
6880 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006881
6882 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006883 /* Create unicode object */
Victor Stinner76a31a62011-11-04 00:05:13 +01006884 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006885 if (*v == NULL)
6886 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006887 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006888 }
6889 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006890 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006891 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner76a31a62011-11-04 00:05:13 +01006892 if (PyUnicode_Resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006893 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006894 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006895 }
6896
6897 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006898 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6899 if (outsize <= 0)
6900 goto error;
6901 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006902
Victor Stinner3a50e702011-10-18 21:21:00 +02006903error:
6904 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6905 return -2;
6906 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006907 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006908}
6909
Victor Stinner3a50e702011-10-18 21:21:00 +02006910/*
6911 * Decode a byte string from a code page into unicode object with an error
6912 * handler.
6913 *
6914 * Returns consumed size if succeed, or raise a WindowsError or
6915 * UnicodeDecodeError exception and returns -1 on error.
6916 */
6917static int
6918decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006919 PyObject **v,
6920 const char *in, const int size,
Victor Stinner3a50e702011-10-18 21:21:00 +02006921 const char *errors)
6922{
6923 const char *startin = in;
6924 const char *endin = in + size;
6925 const DWORD flags = decode_code_page_flags(code_page);
6926 /* Ideally, we should get reason from FormatMessage. This is the Windows
6927 2000 English version of the message. */
6928 const char *reason = "No mapping for the Unicode character exists "
6929 "in the target code page.";
6930 /* each step cannot decode more than 1 character, but a character can be
6931 represented as a surrogate pair */
6932 wchar_t buffer[2], *startout, *out;
6933 int insize, outsize;
6934 PyObject *errorHandler = NULL;
6935 PyObject *exc = NULL;
6936 PyObject *encoding_obj = NULL;
6937 char *encoding;
6938 DWORD err;
6939 int ret = -1;
6940
6941 assert(size > 0);
6942
6943 encoding = code_page_name(code_page, &encoding_obj);
6944 if (encoding == NULL)
6945 return -1;
6946
6947 if (errors == NULL || strcmp(errors, "strict") == 0) {
6948 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6949 UnicodeDecodeError. */
6950 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6951 if (exc != NULL) {
6952 PyCodec_StrictErrors(exc);
6953 Py_CLEAR(exc);
6954 }
6955 goto error;
6956 }
6957
6958 if (*v == NULL) {
6959 /* Create unicode object */
6960 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6961 PyErr_NoMemory();
6962 goto error;
6963 }
Victor Stinner76a31a62011-11-04 00:05:13 +01006964 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006965 if (*v == NULL)
6966 goto error;
6967 startout = PyUnicode_AS_UNICODE(*v);
6968 }
6969 else {
6970 /* Extend unicode object */
6971 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6972 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6973 PyErr_NoMemory();
6974 goto error;
6975 }
Victor Stinner76a31a62011-11-04 00:05:13 +01006976 if (PyUnicode_Resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006977 goto error;
6978 startout = PyUnicode_AS_UNICODE(*v) + n;
6979 }
6980
6981 /* Decode the byte string character per character */
6982 out = startout;
6983 while (in < endin)
6984 {
6985 /* Decode a character */
6986 insize = 1;
6987 do
6988 {
6989 outsize = MultiByteToWideChar(code_page, flags,
6990 in, insize,
6991 buffer, Py_ARRAY_LENGTH(buffer));
6992 if (outsize > 0)
6993 break;
6994 err = GetLastError();
6995 if (err != ERROR_NO_UNICODE_TRANSLATION
6996 && err != ERROR_INSUFFICIENT_BUFFER)
6997 {
6998 PyErr_SetFromWindowsErr(0);
6999 goto error;
7000 }
7001 insize++;
7002 }
7003 /* 4=maximum length of a UTF-8 sequence */
7004 while (insize <= 4 && (in + insize) <= endin);
7005
7006 if (outsize <= 0) {
7007 Py_ssize_t startinpos, endinpos, outpos;
7008
7009 startinpos = in - startin;
7010 endinpos = startinpos + 1;
7011 outpos = out - PyUnicode_AS_UNICODE(*v);
7012 if (unicode_decode_call_errorhandler(
7013 errors, &errorHandler,
7014 encoding, reason,
7015 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01007016 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007017 {
7018 goto error;
7019 }
Victor Stinner596a6c42011-11-09 00:02:18 +01007020 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007021 }
7022 else {
7023 in += insize;
7024 memcpy(out, buffer, outsize * sizeof(wchar_t));
7025 out += outsize;
7026 }
7027 }
7028
7029 /* write a NUL character at the end */
7030 *out = 0;
7031
7032 /* Extend unicode object */
7033 outsize = out - startout;
7034 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner76a31a62011-11-04 00:05:13 +01007035 if (PyUnicode_Resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007036 goto error;
Victor Stinner76a31a62011-11-04 00:05:13 +01007037 ret = size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007038
7039error:
7040 Py_XDECREF(encoding_obj);
7041 Py_XDECREF(errorHandler);
7042 Py_XDECREF(exc);
7043 return ret;
7044}
7045
Victor Stinner3a50e702011-10-18 21:21:00 +02007046static PyObject *
7047decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007048 const char *s, Py_ssize_t size,
7049 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007050{
Victor Stinner76a31a62011-11-04 00:05:13 +01007051 PyObject *v = NULL;
7052 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007053
Victor Stinner3a50e702011-10-18 21:21:00 +02007054 if (code_page < 0) {
7055 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7056 return NULL;
7057 }
7058
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007059 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007060 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007061
Victor Stinner76a31a62011-11-04 00:05:13 +01007062 do
7063 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007064#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007065 if (size > INT_MAX) {
7066 chunk_size = INT_MAX;
7067 final = 0;
7068 done = 0;
7069 }
7070 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007071#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007072 {
7073 chunk_size = (int)size;
7074 final = (consumed == NULL);
7075 done = 1;
7076 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007077
Victor Stinner76a31a62011-11-04 00:05:13 +01007078 /* Skip trailing lead-byte unless 'final' is set */
7079 if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
7080 --chunk_size;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007081
Victor Stinner76a31a62011-11-04 00:05:13 +01007082 if (chunk_size == 0 && done) {
7083 if (v != NULL)
7084 break;
7085 Py_INCREF(unicode_empty);
7086 return unicode_empty;
7087 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007088
Victor Stinner76a31a62011-11-04 00:05:13 +01007089
7090 converted = decode_code_page_strict(code_page, &v,
7091 s, chunk_size);
7092 if (converted == -2)
7093 converted = decode_code_page_errors(code_page, &v,
7094 s, chunk_size,
7095 errors);
7096 assert(converted != 0);
7097
7098 if (converted < 0) {
7099 Py_XDECREF(v);
7100 return NULL;
7101 }
7102
7103 if (consumed)
7104 *consumed += converted;
7105
7106 s += converted;
7107 size -= converted;
7108 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007109
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007110 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007111}
7112
Alexander Belopolsky40018472011-02-26 01:02:56 +00007113PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007114PyUnicode_DecodeCodePageStateful(int code_page,
7115 const char *s,
7116 Py_ssize_t size,
7117 const char *errors,
7118 Py_ssize_t *consumed)
7119{
7120 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7121}
7122
7123PyObject *
7124PyUnicode_DecodeMBCSStateful(const char *s,
7125 Py_ssize_t size,
7126 const char *errors,
7127 Py_ssize_t *consumed)
7128{
7129 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7130}
7131
7132PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007133PyUnicode_DecodeMBCS(const char *s,
7134 Py_ssize_t size,
7135 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007136{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007137 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7138}
7139
Victor Stinner3a50e702011-10-18 21:21:00 +02007140static DWORD
7141encode_code_page_flags(UINT code_page, const char *errors)
7142{
7143 if (code_page == CP_UTF8) {
7144 if (winver.dwMajorVersion >= 6)
7145 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7146 and later */
7147 return WC_ERR_INVALID_CHARS;
7148 else
7149 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7150 return 0;
7151 }
7152 else if (code_page == CP_UTF7) {
7153 /* CP_UTF7 only supports flags=0 */
7154 return 0;
7155 }
7156 else {
7157 if (errors != NULL && strcmp(errors, "replace") == 0)
7158 return 0;
7159 else
7160 return WC_NO_BEST_FIT_CHARS;
7161 }
7162}
7163
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007164/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007165 * Encode a Unicode string to a Windows code page into a byte string in strict
7166 * mode.
7167 *
7168 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7169 * a WindowsError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007170 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007171static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007172encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007173 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007174 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007175{
Victor Stinner554f3f02010-06-16 23:33:54 +00007176 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007177 BOOL *pusedDefaultChar = &usedDefaultChar;
7178 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007179 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007180 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007181 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007182 const DWORD flags = encode_code_page_flags(code_page, NULL);
7183 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007184 /* Create a substring so that we can get the UTF-16 representation
7185 of just the slice under consideration. */
7186 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007187
Martin v. Löwis3d325192011-11-04 18:23:06 +01007188 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007189
Victor Stinner3a50e702011-10-18 21:21:00 +02007190 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007191 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007192 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007193 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007194
Victor Stinner2fc507f2011-11-04 20:06:39 +01007195 substring = PyUnicode_Substring(unicode, offset, offset+len);
7196 if (substring == NULL)
7197 return -1;
7198 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7199 if (p == NULL) {
7200 Py_DECREF(substring);
7201 return -1;
7202 }
Martin v. Löwis3d325192011-11-04 18:23:06 +01007203
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007204 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007205 outsize = WideCharToMultiByte(code_page, flags,
7206 p, size,
7207 NULL, 0,
7208 NULL, pusedDefaultChar);
7209 if (outsize <= 0)
7210 goto error;
7211 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007212 if (pusedDefaultChar && *pusedDefaultChar) {
7213 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007214 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007215 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007216
Victor Stinner3a50e702011-10-18 21:21:00 +02007217 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007218 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007219 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007220 if (*outbytes == NULL) {
7221 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007222 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007223 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007224 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007225 }
7226 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007227 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007228 const Py_ssize_t n = PyBytes_Size(*outbytes);
7229 if (outsize > PY_SSIZE_T_MAX - n) {
7230 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007231 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007232 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007233 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007234 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7235 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007236 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007237 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007238 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007239 }
7240
7241 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007242 outsize = WideCharToMultiByte(code_page, flags,
7243 p, size,
7244 out, outsize,
7245 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007246 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007247 if (outsize <= 0)
7248 goto error;
7249 if (pusedDefaultChar && *pusedDefaultChar)
7250 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007251 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007252
Victor Stinner3a50e702011-10-18 21:21:00 +02007253error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007254 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007255 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7256 return -2;
7257 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007258 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007259}
7260
Victor Stinner3a50e702011-10-18 21:21:00 +02007261/*
7262 * Encode a Unicode string to a Windows code page into a byte string using a
7263 * error handler.
7264 *
7265 * Returns consumed characters if succeed, or raise a WindowsError and returns
7266 * -1 on other error.
7267 */
7268static int
7269encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007270 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007271 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007272{
Victor Stinner3a50e702011-10-18 21:21:00 +02007273 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007274 Py_ssize_t pos = unicode_offset;
7275 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007276 /* Ideally, we should get reason from FormatMessage. This is the Windows
7277 2000 English version of the message. */
7278 const char *reason = "invalid character";
7279 /* 4=maximum length of a UTF-8 sequence */
7280 char buffer[4];
7281 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7282 Py_ssize_t outsize;
7283 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007284 PyObject *errorHandler = NULL;
7285 PyObject *exc = NULL;
7286 PyObject *encoding_obj = NULL;
7287 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007288 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007289 PyObject *rep;
7290 int ret = -1;
7291
7292 assert(insize > 0);
7293
7294 encoding = code_page_name(code_page, &encoding_obj);
7295 if (encoding == NULL)
7296 return -1;
7297
7298 if (errors == NULL || strcmp(errors, "strict") == 0) {
7299 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7300 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007301 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007302 if (exc != NULL) {
7303 PyCodec_StrictErrors(exc);
7304 Py_DECREF(exc);
7305 }
7306 Py_XDECREF(encoding_obj);
7307 return -1;
7308 }
7309
7310 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7311 pusedDefaultChar = &usedDefaultChar;
7312 else
7313 pusedDefaultChar = NULL;
7314
7315 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7316 PyErr_NoMemory();
7317 goto error;
7318 }
7319 outsize = insize * Py_ARRAY_LENGTH(buffer);
7320
7321 if (*outbytes == NULL) {
7322 /* Create string object */
7323 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7324 if (*outbytes == NULL)
7325 goto error;
7326 out = PyBytes_AS_STRING(*outbytes);
7327 }
7328 else {
7329 /* Extend string object */
7330 Py_ssize_t n = PyBytes_Size(*outbytes);
7331 if (n > PY_SSIZE_T_MAX - outsize) {
7332 PyErr_NoMemory();
7333 goto error;
7334 }
7335 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7336 goto error;
7337 out = PyBytes_AS_STRING(*outbytes) + n;
7338 }
7339
7340 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007341 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007342 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007343 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7344 wchar_t chars[2];
7345 int charsize;
7346 if (ch < 0x10000) {
7347 chars[0] = (wchar_t)ch;
7348 charsize = 1;
7349 }
7350 else {
7351 ch -= 0x10000;
7352 chars[0] = 0xd800 + (ch >> 10);
7353 chars[1] = 0xdc00 + (ch & 0x3ff);
7354 charsize = 2;
7355 }
7356
Victor Stinner3a50e702011-10-18 21:21:00 +02007357 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007358 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007359 buffer, Py_ARRAY_LENGTH(buffer),
7360 NULL, pusedDefaultChar);
7361 if (outsize > 0) {
7362 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7363 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007364 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007365 memcpy(out, buffer, outsize);
7366 out += outsize;
7367 continue;
7368 }
7369 }
7370 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7371 PyErr_SetFromWindowsErr(0);
7372 goto error;
7373 }
7374
Victor Stinner3a50e702011-10-18 21:21:00 +02007375 rep = unicode_encode_call_errorhandler(
7376 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007377 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007378 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007379 if (rep == NULL)
7380 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007381 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007382
7383 if (PyBytes_Check(rep)) {
7384 outsize = PyBytes_GET_SIZE(rep);
7385 if (outsize != 1) {
7386 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7387 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7388 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7389 Py_DECREF(rep);
7390 goto error;
7391 }
7392 out = PyBytes_AS_STRING(*outbytes) + offset;
7393 }
7394 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7395 out += outsize;
7396 }
7397 else {
7398 Py_ssize_t i;
7399 enum PyUnicode_Kind kind;
7400 void *data;
7401
7402 if (PyUnicode_READY(rep) < 0) {
7403 Py_DECREF(rep);
7404 goto error;
7405 }
7406
7407 outsize = PyUnicode_GET_LENGTH(rep);
7408 if (outsize != 1) {
7409 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7410 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7411 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7412 Py_DECREF(rep);
7413 goto error;
7414 }
7415 out = PyBytes_AS_STRING(*outbytes) + offset;
7416 }
7417 kind = PyUnicode_KIND(rep);
7418 data = PyUnicode_DATA(rep);
7419 for (i=0; i < outsize; i++) {
7420 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7421 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007422 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007423 encoding, unicode,
7424 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007425 "unable to encode error handler result to ASCII");
7426 Py_DECREF(rep);
7427 goto error;
7428 }
7429 *out = (unsigned char)ch;
7430 out++;
7431 }
7432 }
7433 Py_DECREF(rep);
7434 }
7435 /* write a NUL byte */
7436 *out = 0;
7437 outsize = out - PyBytes_AS_STRING(*outbytes);
7438 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7439 if (_PyBytes_Resize(outbytes, outsize) < 0)
7440 goto error;
7441 ret = 0;
7442
7443error:
7444 Py_XDECREF(encoding_obj);
7445 Py_XDECREF(errorHandler);
7446 Py_XDECREF(exc);
7447 return ret;
7448}
7449
Victor Stinner3a50e702011-10-18 21:21:00 +02007450static PyObject *
7451encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007452 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007453 const char *errors)
7454{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007455 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007456 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007457 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007458 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007459
Victor Stinner2fc507f2011-11-04 20:06:39 +01007460 if (PyUnicode_READY(unicode) < 0)
7461 return NULL;
7462 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007463
Victor Stinner3a50e702011-10-18 21:21:00 +02007464 if (code_page < 0) {
7465 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7466 return NULL;
7467 }
7468
Martin v. Löwis3d325192011-11-04 18:23:06 +01007469 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007470 return PyBytes_FromStringAndSize(NULL, 0);
7471
Victor Stinner7581cef2011-11-03 22:32:33 +01007472 offset = 0;
7473 do
7474 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007475#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007476 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007477 chunks. */
7478 if (len > INT_MAX/2) {
7479 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007480 done = 0;
7481 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007482 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007483#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007484 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007485 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007486 done = 1;
7487 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007488
Victor Stinner76a31a62011-11-04 00:05:13 +01007489 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007490 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007491 errors);
7492 if (ret == -2)
7493 ret = encode_code_page_errors(code_page, &outbytes,
7494 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007495 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007496 if (ret < 0) {
7497 Py_XDECREF(outbytes);
7498 return NULL;
7499 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007500
Victor Stinner7581cef2011-11-03 22:32:33 +01007501 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007502 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007503 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007504
Victor Stinner3a50e702011-10-18 21:21:00 +02007505 return outbytes;
7506}
7507
7508PyObject *
7509PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7510 Py_ssize_t size,
7511 const char *errors)
7512{
Victor Stinner7581cef2011-11-03 22:32:33 +01007513 PyObject *unicode, *res;
7514 unicode = PyUnicode_FromUnicode(p, size);
7515 if (unicode == NULL)
7516 return NULL;
7517 res = encode_code_page(CP_ACP, unicode, errors);
7518 Py_DECREF(unicode);
7519 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007520}
7521
7522PyObject *
7523PyUnicode_EncodeCodePage(int code_page,
7524 PyObject *unicode,
7525 const char *errors)
7526{
Victor Stinner7581cef2011-11-03 22:32:33 +01007527 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007528}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007529
Alexander Belopolsky40018472011-02-26 01:02:56 +00007530PyObject *
7531PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007532{
7533 if (!PyUnicode_Check(unicode)) {
7534 PyErr_BadArgument();
7535 return NULL;
7536 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007537 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007538}
7539
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007540#undef NEED_RETRY
7541
Victor Stinner99b95382011-07-04 14:23:54 +02007542#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007543
Guido van Rossumd57fd912000-03-10 22:53:23 +00007544/* --- Character Mapping Codec -------------------------------------------- */
7545
Alexander Belopolsky40018472011-02-26 01:02:56 +00007546PyObject *
7547PyUnicode_DecodeCharmap(const char *s,
7548 Py_ssize_t size,
7549 PyObject *mapping,
7550 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007551{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007552 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007553 Py_ssize_t startinpos;
7554 Py_ssize_t endinpos;
7555 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007556 const char *e;
Victor Stinner7931d9a2011-11-04 00:22:48 +01007557 PyObject *v;
Martin v. Löwis18e16552006-02-15 17:27:45 +00007558 Py_ssize_t extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007559 PyObject *errorHandler = NULL;
7560 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00007561
Guido van Rossumd57fd912000-03-10 22:53:23 +00007562 /* Default to Latin-1 */
7563 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007564 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007565
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007566 v = PyUnicode_New(size, 127);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007567 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007568 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007569 if (size == 0)
Victor Stinner7931d9a2011-11-04 00:22:48 +01007570 return v;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007571 outpos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007572 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007573 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007574 Py_ssize_t maplen;
7575 enum PyUnicode_Kind kind;
7576 void *data;
7577 Py_UCS4 x;
7578
7579 if (PyUnicode_READY(mapping) < 0)
7580 return NULL;
7581
7582 maplen = PyUnicode_GET_LENGTH(mapping);
7583 data = PyUnicode_DATA(mapping);
7584 kind = PyUnicode_KIND(mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00007585 while (s < e) {
7586 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007587
Benjamin Peterson29060642009-01-31 22:14:21 +00007588 if (ch < maplen)
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007589 x = PyUnicode_READ(kind, data, ch);
7590 else
7591 x = 0xfffe; /* invalid value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007592
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007593 if (x == 0xfffe)
7594 {
Benjamin Peterson29060642009-01-31 22:14:21 +00007595 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007596 startinpos = s-starts;
7597 endinpos = startinpos+1;
7598 if (unicode_decode_call_errorhandler(
7599 errors, &errorHandler,
7600 "charmap", "character maps to <undefined>",
7601 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007602 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007603 goto onError;
7604 }
7605 continue;
7606 }
Victor Stinnerebf3ba82011-11-10 20:30:22 +01007607
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007608 if (unicode_putchar(&v, &outpos, x) < 0)
7609 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007610 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007611 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007612 }
7613 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007614 while (s < e) {
7615 unsigned char ch = *s;
7616 PyObject *w, *x;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007617
Benjamin Peterson29060642009-01-31 22:14:21 +00007618 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7619 w = PyLong_FromLong((long)ch);
7620 if (w == NULL)
7621 goto onError;
7622 x = PyObject_GetItem(mapping, w);
7623 Py_DECREF(w);
7624 if (x == NULL) {
7625 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7626 /* No mapping found means: mapping is undefined. */
7627 PyErr_Clear();
7628 x = Py_None;
7629 Py_INCREF(x);
7630 } else
7631 goto onError;
7632 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00007633
Benjamin Peterson29060642009-01-31 22:14:21 +00007634 /* Apply mapping */
7635 if (PyLong_Check(x)) {
7636 long value = PyLong_AS_LONG(x);
7637 if (value < 0 || value > 65535) {
7638 PyErr_SetString(PyExc_TypeError,
7639 "character mapping must be in range(65536)");
7640 Py_DECREF(x);
7641 goto onError;
7642 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007643 if (unicode_putchar(&v, &outpos, value) < 0)
7644 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007645 }
7646 else if (x == Py_None) {
7647 /* undefined mapping */
Benjamin Peterson29060642009-01-31 22:14:21 +00007648 startinpos = s-starts;
7649 endinpos = startinpos+1;
7650 if (unicode_decode_call_errorhandler(
7651 errors, &errorHandler,
7652 "charmap", "character maps to <undefined>",
7653 &starts, &e, &startinpos, &endinpos, &exc, &s,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007654 &v, &outpos)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007655 Py_DECREF(x);
7656 goto onError;
7657 }
7658 Py_DECREF(x);
7659 continue;
7660 }
7661 else if (PyUnicode_Check(x)) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007662 Py_ssize_t targetsize;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007663
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007664 if (PyUnicode_READY(x) < 0)
7665 goto onError;
7666 targetsize = PyUnicode_GET_LENGTH(x);
7667
7668 if (targetsize == 1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007669 /* 1-1 mapping */
Victor Stinner62aa4d02011-11-09 00:03:45 +01007670 if (unicode_putchar(&v, &outpos,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007671 PyUnicode_READ_CHAR(x, 0)) < 0)
7672 goto onError;
7673 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007674 else if (targetsize > 1) {
7675 /* 1-n mapping */
7676 if (targetsize > extrachars) {
7677 /* resize first */
Benjamin Peterson29060642009-01-31 22:14:21 +00007678 Py_ssize_t needed = (targetsize - extrachars) + \
7679 (targetsize << 2);
7680 extrachars += needed;
7681 /* XXX overflow detection missing */
Victor Stinner7931d9a2011-11-04 00:22:48 +01007682 if (PyUnicode_Resize(&v,
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007683 PyUnicode_GET_LENGTH(v) + needed) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007684 Py_DECREF(x);
7685 goto onError;
7686 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007687 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007688 if (unicode_widen(&v, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
7689 goto onError;
7690 PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
7691 outpos += targetsize;
Benjamin Peterson29060642009-01-31 22:14:21 +00007692 extrachars -= targetsize;
7693 }
7694 /* 1-0 mapping: skip the character */
7695 }
7696 else {
7697 /* wrong return value */
7698 PyErr_SetString(PyExc_TypeError,
7699 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00007700 Py_DECREF(x);
7701 goto onError;
7702 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007703 Py_DECREF(x);
7704 ++s;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007705 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007706 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01007707 if (PyUnicode_Resize(&v, outpos) < 0)
Antoine Pitroua8f63c02011-11-08 18:37:16 +01007708 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007709 Py_XDECREF(errorHandler);
7710 Py_XDECREF(exc);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007711 return unicode_result(v);
Tim Petersced69f82003-09-16 20:30:58 +00007712
Benjamin Peterson29060642009-01-31 22:14:21 +00007713 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007714 Py_XDECREF(errorHandler);
7715 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007716 Py_XDECREF(v);
7717 return NULL;
7718}
7719
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007720/* Charmap encoding: the lookup table */
7721
Alexander Belopolsky40018472011-02-26 01:02:56 +00007722struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007723 PyObject_HEAD
7724 unsigned char level1[32];
7725 int count2, count3;
7726 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007727};
7728
7729static PyObject*
7730encoding_map_size(PyObject *obj, PyObject* args)
7731{
7732 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007733 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007734 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007735}
7736
7737static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007738 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007739 PyDoc_STR("Return the size (in bytes) of this object") },
7740 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007741};
7742
7743static void
7744encoding_map_dealloc(PyObject* o)
7745{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007746 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007747}
7748
7749static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007750 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007751 "EncodingMap", /*tp_name*/
7752 sizeof(struct encoding_map), /*tp_basicsize*/
7753 0, /*tp_itemsize*/
7754 /* methods */
7755 encoding_map_dealloc, /*tp_dealloc*/
7756 0, /*tp_print*/
7757 0, /*tp_getattr*/
7758 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007759 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007760 0, /*tp_repr*/
7761 0, /*tp_as_number*/
7762 0, /*tp_as_sequence*/
7763 0, /*tp_as_mapping*/
7764 0, /*tp_hash*/
7765 0, /*tp_call*/
7766 0, /*tp_str*/
7767 0, /*tp_getattro*/
7768 0, /*tp_setattro*/
7769 0, /*tp_as_buffer*/
7770 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7771 0, /*tp_doc*/
7772 0, /*tp_traverse*/
7773 0, /*tp_clear*/
7774 0, /*tp_richcompare*/
7775 0, /*tp_weaklistoffset*/
7776 0, /*tp_iter*/
7777 0, /*tp_iternext*/
7778 encoding_map_methods, /*tp_methods*/
7779 0, /*tp_members*/
7780 0, /*tp_getset*/
7781 0, /*tp_base*/
7782 0, /*tp_dict*/
7783 0, /*tp_descr_get*/
7784 0, /*tp_descr_set*/
7785 0, /*tp_dictoffset*/
7786 0, /*tp_init*/
7787 0, /*tp_alloc*/
7788 0, /*tp_new*/
7789 0, /*tp_free*/
7790 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007791};
7792
7793PyObject*
7794PyUnicode_BuildEncodingMap(PyObject* string)
7795{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007796 PyObject *result;
7797 struct encoding_map *mresult;
7798 int i;
7799 int need_dict = 0;
7800 unsigned char level1[32];
7801 unsigned char level2[512];
7802 unsigned char *mlevel1, *mlevel2, *mlevel3;
7803 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007804 int kind;
7805 void *data;
7806 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007807
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007808 if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007809 PyErr_BadArgument();
7810 return NULL;
7811 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007812 kind = PyUnicode_KIND(string);
7813 data = PyUnicode_DATA(string);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007814 memset(level1, 0xFF, sizeof level1);
7815 memset(level2, 0xFF, sizeof level2);
7816
7817 /* If there isn't a one-to-one mapping of NULL to \0,
7818 or if there are non-BMP characters, we need to use
7819 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007820 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007821 need_dict = 1;
7822 for (i = 1; i < 256; i++) {
7823 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007824 ch = PyUnicode_READ(kind, data, i);
7825 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007826 need_dict = 1;
7827 break;
7828 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007829 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007830 /* unmapped character */
7831 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007832 l1 = ch >> 11;
7833 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007834 if (level1[l1] == 0xFF)
7835 level1[l1] = count2++;
7836 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007837 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007838 }
7839
7840 if (count2 >= 0xFF || count3 >= 0xFF)
7841 need_dict = 1;
7842
7843 if (need_dict) {
7844 PyObject *result = PyDict_New();
7845 PyObject *key, *value;
7846 if (!result)
7847 return NULL;
7848 for (i = 0; i < 256; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007849 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007850 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007851 if (!key || !value)
7852 goto failed1;
7853 if (PyDict_SetItem(result, key, value) == -1)
7854 goto failed1;
7855 Py_DECREF(key);
7856 Py_DECREF(value);
7857 }
7858 return result;
7859 failed1:
7860 Py_XDECREF(key);
7861 Py_XDECREF(value);
7862 Py_DECREF(result);
7863 return NULL;
7864 }
7865
7866 /* Create a three-level trie */
7867 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7868 16*count2 + 128*count3 - 1);
7869 if (!result)
7870 return PyErr_NoMemory();
7871 PyObject_Init(result, &EncodingMapType);
7872 mresult = (struct encoding_map*)result;
7873 mresult->count2 = count2;
7874 mresult->count3 = count3;
7875 mlevel1 = mresult->level1;
7876 mlevel2 = mresult->level23;
7877 mlevel3 = mresult->level23 + 16*count2;
7878 memcpy(mlevel1, level1, 32);
7879 memset(mlevel2, 0xFF, 16*count2);
7880 memset(mlevel3, 0, 128*count3);
7881 count3 = 0;
7882 for (i = 1; i < 256; i++) {
7883 int o1, o2, o3, i2, i3;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007884 if (PyUnicode_READ(kind, data, i) == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007885 /* unmapped character */
7886 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007887 o1 = PyUnicode_READ(kind, data, i)>>11;
7888 o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007889 i2 = 16*mlevel1[o1] + o2;
7890 if (mlevel2[i2] == 0xFF)
7891 mlevel2[i2] = count3++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007892 o3 = PyUnicode_READ(kind, data, i) & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007893 i3 = 128*mlevel2[i2] + o3;
7894 mlevel3[i3] = i;
7895 }
7896 return result;
7897}
7898
7899static int
Victor Stinner22168992011-11-20 17:09:18 +01007900encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007901{
7902 struct encoding_map *map = (struct encoding_map*)mapping;
7903 int l1 = c>>11;
7904 int l2 = (c>>7) & 0xF;
7905 int l3 = c & 0x7F;
7906 int i;
7907
Victor Stinner22168992011-11-20 17:09:18 +01007908 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007909 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007910 if (c == 0)
7911 return 0;
7912 /* level 1*/
7913 i = map->level1[l1];
7914 if (i == 0xFF) {
7915 return -1;
7916 }
7917 /* level 2*/
7918 i = map->level23[16*i+l2];
7919 if (i == 0xFF) {
7920 return -1;
7921 }
7922 /* level 3 */
7923 i = map->level23[16*map->count2 + 128*i + l3];
7924 if (i == 0) {
7925 return -1;
7926 }
7927 return i;
7928}
7929
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007930/* Lookup the character ch in the mapping. If the character
7931 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007932 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007933static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007934charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007935{
Christian Heimes217cfd12007-12-02 14:31:20 +00007936 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007937 PyObject *x;
7938
7939 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007940 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007941 x = PyObject_GetItem(mapping, w);
7942 Py_DECREF(w);
7943 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007944 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7945 /* No mapping found means: mapping is undefined. */
7946 PyErr_Clear();
7947 x = Py_None;
7948 Py_INCREF(x);
7949 return x;
7950 } else
7951 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007952 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007953 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007954 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007955 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007956 long value = PyLong_AS_LONG(x);
7957 if (value < 0 || value > 255) {
7958 PyErr_SetString(PyExc_TypeError,
7959 "character mapping must be in range(256)");
7960 Py_DECREF(x);
7961 return NULL;
7962 }
7963 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007964 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007965 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007966 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007967 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007968 /* wrong return value */
7969 PyErr_Format(PyExc_TypeError,
7970 "character mapping must return integer, bytes or None, not %.400s",
7971 x->ob_type->tp_name);
7972 Py_DECREF(x);
7973 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007974 }
7975}
7976
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007977static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007978charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007979{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007980 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7981 /* exponentially overallocate to minimize reallocations */
7982 if (requiredsize < 2*outsize)
7983 requiredsize = 2*outsize;
7984 if (_PyBytes_Resize(outobj, requiredsize))
7985 return -1;
7986 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007987}
7988
/* Outcome of trying to encode a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping has no encoding for the character */
    enc_EXCEPTION   /* a Python-level error occurred while encoding */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007992/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00007993 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007994 space is available. Return a new reference to the object that
7995 was put in the output buffer, or Py_None, if the mapping was undefined
7996 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00007997 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007998static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01007999charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008000 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008001{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008002 PyObject *rep;
8003 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008004 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008005
Christian Heimes90aa7642007-12-19 02:45:37 +00008006 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008007 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008008 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008009 if (res == -1)
8010 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008011 if (outsize<requiredsize)
8012 if (charmapencode_resize(outobj, outpos, requiredsize))
8013 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008014 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008015 outstart[(*outpos)++] = (char)res;
8016 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008017 }
8018
8019 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008020 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008021 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008022 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008023 Py_DECREF(rep);
8024 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008025 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008026 if (PyLong_Check(rep)) {
8027 Py_ssize_t requiredsize = *outpos+1;
8028 if (outsize<requiredsize)
8029 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8030 Py_DECREF(rep);
8031 return enc_EXCEPTION;
8032 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008033 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008034 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008035 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008036 else {
8037 const char *repchars = PyBytes_AS_STRING(rep);
8038 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8039 Py_ssize_t requiredsize = *outpos+repsize;
8040 if (outsize<requiredsize)
8041 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8042 Py_DECREF(rep);
8043 return enc_EXCEPTION;
8044 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008045 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008046 memcpy(outstart + *outpos, repchars, repsize);
8047 *outpos += repsize;
8048 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008049 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008050 Py_DECREF(rep);
8051 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008052}
8053
8054/* handle an error in PyUnicode_EncodeCharmap
8055 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008056static int
8057charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008058 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008059 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008060 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008061 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008062{
8063 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008064 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008065 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008066 enum PyUnicode_Kind kind;
8067 void *data;
8068 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008069 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008070 Py_ssize_t collstartpos = *inpos;
8071 Py_ssize_t collendpos = *inpos+1;
8072 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008073 char *encoding = "charmap";
8074 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008075 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008076 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008077 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008078
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008079 if (PyUnicode_READY(unicode) < 0)
8080 return -1;
8081 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008082 /* find all unencodable characters */
8083 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008084 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008085 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008086 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008087 val = encoding_map_lookup(ch, mapping);
8088 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008089 break;
8090 ++collendpos;
8091 continue;
8092 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008093
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008094 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8095 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008096 if (rep==NULL)
8097 return -1;
8098 else if (rep!=Py_None) {
8099 Py_DECREF(rep);
8100 break;
8101 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008102 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008103 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008104 }
8105 /* cache callback name lookup
8106 * (if not done yet, i.e. it's the first error) */
8107 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008108 if ((errors==NULL) || (!strcmp(errors, "strict")))
8109 *known_errorHandler = 1;
8110 else if (!strcmp(errors, "replace"))
8111 *known_errorHandler = 2;
8112 else if (!strcmp(errors, "ignore"))
8113 *known_errorHandler = 3;
8114 else if (!strcmp(errors, "xmlcharrefreplace"))
8115 *known_errorHandler = 4;
8116 else
8117 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008118 }
8119 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008120 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008121 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008122 return -1;
8123 case 2: /* replace */
8124 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008125 x = charmapencode_output('?', mapping, res, respos);
8126 if (x==enc_EXCEPTION) {
8127 return -1;
8128 }
8129 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008130 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008131 return -1;
8132 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008133 }
8134 /* fall through */
8135 case 3: /* ignore */
8136 *inpos = collendpos;
8137 break;
8138 case 4: /* xmlcharrefreplace */
8139 /* generate replacement (temporarily (mis)uses p) */
8140 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008141 char buffer[2+29+1+1];
8142 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008143 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008144 for (cp = buffer; *cp; ++cp) {
8145 x = charmapencode_output(*cp, mapping, res, respos);
8146 if (x==enc_EXCEPTION)
8147 return -1;
8148 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008149 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008150 return -1;
8151 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008152 }
8153 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008154 *inpos = collendpos;
8155 break;
8156 default:
8157 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008158 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008159 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008160 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008161 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008162 if (PyBytes_Check(repunicode)) {
8163 /* Directly copy bytes result to output. */
8164 Py_ssize_t outsize = PyBytes_Size(*res);
8165 Py_ssize_t requiredsize;
8166 repsize = PyBytes_Size(repunicode);
8167 requiredsize = *respos + repsize;
8168 if (requiredsize > outsize)
8169 /* Make room for all additional bytes. */
8170 if (charmapencode_resize(res, respos, requiredsize)) {
8171 Py_DECREF(repunicode);
8172 return -1;
8173 }
8174 memcpy(PyBytes_AsString(*res) + *respos,
8175 PyBytes_AsString(repunicode), repsize);
8176 *respos += repsize;
8177 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008178 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008179 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008180 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008181 /* generate replacement */
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008182 if (PyUnicode_READY(repunicode) < 0) {
8183 Py_DECREF(repunicode);
8184 return -1;
8185 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008186 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008187 data = PyUnicode_DATA(repunicode);
8188 kind = PyUnicode_KIND(repunicode);
8189 for (index = 0; index < repsize; index++) {
8190 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8191 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008192 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008193 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008194 return -1;
8195 }
8196 else if (x==enc_FAILED) {
8197 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008198 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008199 return -1;
8200 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008201 }
8202 *inpos = newpos;
8203 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008204 }
8205 return 0;
8206}
8207
Alexander Belopolsky40018472011-02-26 01:02:56 +00008208PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008209_PyUnicode_EncodeCharmap(PyObject *unicode,
8210 PyObject *mapping,
8211 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008212{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008213 /* output object */
8214 PyObject *res = NULL;
8215 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008216 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008217 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008218 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008219 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008220 PyObject *errorHandler = NULL;
8221 PyObject *exc = NULL;
8222 /* the following variable is used for caching string comparisons
8223 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8224 * 3=ignore, 4=xmlcharrefreplace */
8225 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008226
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008227 if (PyUnicode_READY(unicode) < 0)
8228 return NULL;
8229 size = PyUnicode_GET_LENGTH(unicode);
8230
Guido van Rossumd57fd912000-03-10 22:53:23 +00008231 /* Default to Latin-1 */
8232 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008233 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008234
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008235 /* allocate enough for a simple encoding without
8236 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008237 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008238 if (res == NULL)
8239 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008240 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008241 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008242
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008243 while (inpos<size) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008244 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008245 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008246 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008247 if (x==enc_EXCEPTION) /* error */
8248 goto onError;
8249 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008250 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008251 &exc,
8252 &known_errorHandler, &errorHandler, errors,
8253 &res, &respos)) {
8254 goto onError;
8255 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008256 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008257 else
8258 /* done with this character => adjust input position */
8259 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008260 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008261
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008262 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008263 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008264 if (_PyBytes_Resize(&res, respos) < 0)
8265 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008266
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008267 Py_XDECREF(exc);
8268 Py_XDECREF(errorHandler);
8269 return res;
8270
Benjamin Peterson29060642009-01-31 22:14:21 +00008271 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008272 Py_XDECREF(res);
8273 Py_XDECREF(exc);
8274 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008275 return NULL;
8276}
8277
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008278/* Deprecated */
8279PyObject *
8280PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8281 Py_ssize_t size,
8282 PyObject *mapping,
8283 const char *errors)
8284{
8285 PyObject *result;
8286 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8287 if (unicode == NULL)
8288 return NULL;
8289 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8290 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008291 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008292}
8293
Alexander Belopolsky40018472011-02-26 01:02:56 +00008294PyObject *
8295PyUnicode_AsCharmapString(PyObject *unicode,
8296 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008297{
8298 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008299 PyErr_BadArgument();
8300 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008301 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008302 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008303}
8304
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008305/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008306static void
8307make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008308 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008309 Py_ssize_t startpos, Py_ssize_t endpos,
8310 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008311{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008312 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008313 *exceptionObject = _PyUnicodeTranslateError_Create(
8314 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008315 }
8316 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008317 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8318 goto onError;
8319 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8320 goto onError;
8321 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8322 goto onError;
8323 return;
8324 onError:
8325 Py_DECREF(*exceptionObject);
8326 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008327 }
8328}
8329
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008330/* raises a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008331static void
8332raise_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008333 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008334 Py_ssize_t startpos, Py_ssize_t endpos,
8335 const char *reason)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008336{
8337 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008338 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008339 if (*exceptionObject != NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008340 PyCodec_StrictErrors(*exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008341}
8342
8343/* error handling callback helper:
8344 build arguments, call the callback and check the arguments,
8345 put the result into newpos and return the replacement string, which
8346 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008347static PyObject *
8348unicode_translate_call_errorhandler(const char *errors,
8349 PyObject **errorHandler,
8350 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008351 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008352 Py_ssize_t startpos, Py_ssize_t endpos,
8353 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008354{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008355 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008356
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008357 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008358 PyObject *restuple;
8359 PyObject *resunicode;
8360
8361 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008362 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008363 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008364 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008365 }
8366
8367 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008368 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008369 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008370 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008371
8372 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008373 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008374 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008375 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008376 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008377 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008378 Py_DECREF(restuple);
8379 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008380 }
8381 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008382 &resunicode, &i_newpos)) {
8383 Py_DECREF(restuple);
8384 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008385 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008386 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008387 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008388 else
8389 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008390 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008391 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8392 Py_DECREF(restuple);
8393 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008394 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008395 Py_INCREF(resunicode);
8396 Py_DECREF(restuple);
8397 return resunicode;
8398}
8399
8400/* Lookup the character ch in the mapping and put the result in result,
8401 which must be decrefed by the caller.
8402 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008403static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008404charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008405{
Christian Heimes217cfd12007-12-02 14:31:20 +00008406 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008407 PyObject *x;
8408
8409 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008410 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008411 x = PyObject_GetItem(mapping, w);
8412 Py_DECREF(w);
8413 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008414 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8415 /* No mapping found means: use 1:1 mapping. */
8416 PyErr_Clear();
8417 *result = NULL;
8418 return 0;
8419 } else
8420 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008421 }
8422 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008423 *result = x;
8424 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008425 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008426 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008427 long value = PyLong_AS_LONG(x);
8428 long max = PyUnicode_GetMax();
8429 if (value < 0 || value > max) {
8430 PyErr_Format(PyExc_TypeError,
Guido van Rossum5a2f7e602007-10-24 21:13:09 +00008431 "character mapping must be in range(0x%x)", max+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008432 Py_DECREF(x);
8433 return -1;
8434 }
8435 *result = x;
8436 return 0;
8437 }
8438 else if (PyUnicode_Check(x)) {
8439 *result = x;
8440 return 0;
8441 }
8442 else {
8443 /* wrong return value */
8444 PyErr_SetString(PyExc_TypeError,
8445 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008446 Py_DECREF(x);
8447 return -1;
8448 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008449}
8450/* ensure that *outobj is at least requiredsize characters long,
Benjamin Peterson29060642009-01-31 22:14:21 +00008451 if not reallocate and adjust various state variables.
8452 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008453static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008454charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
Benjamin Peterson29060642009-01-31 22:14:21 +00008455 Py_ssize_t requiredsize)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008456{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008457 Py_ssize_t oldsize = *psize;
Walter Dörwald4894c302003-10-24 14:25:28 +00008458 if (requiredsize > oldsize) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008459 /* exponentially overallocate to minimize reallocations */
8460 if (requiredsize < 2 * oldsize)
8461 requiredsize = 2 * oldsize;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008462 *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8463 if (*outobj == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008464 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008465 *psize = requiredsize;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008466 }
8467 return 0;
8468}
8469/* lookup the character, put the result in the output string and adjust
8470 various state variables. Return a new reference to the object that
8471 was put in the output buffer in *result, or Py_None, if the mapping was
8472 undefined (in which case no character was written).
8473 The called must decref result.
8474 Return 0 on success, -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008475static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008476charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8477 PyObject *mapping, Py_UCS4 **output,
8478 Py_ssize_t *osize, Py_ssize_t *opos,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008479 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008480{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008481 Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8482 if (charmaptranslate_lookup(curinp, mapping, res))
Benjamin Peterson29060642009-01-31 22:14:21 +00008483 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008484 if (*res==NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008485 /* not found => default to 1:1 mapping */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008486 (*output)[(*opos)++] = curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008487 }
8488 else if (*res==Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008489 ;
Christian Heimes217cfd12007-12-02 14:31:20 +00008490 else if (PyLong_Check(*res)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008491 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008492 (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008493 }
8494 else if (PyUnicode_Check(*res)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008495 Py_ssize_t repsize;
8496 if (PyUnicode_READY(*res) == -1)
8497 return -1;
8498 repsize = PyUnicode_GET_LENGTH(*res);
Benjamin Peterson29060642009-01-31 22:14:21 +00008499 if (repsize==1) {
8500 /* no overflow check, because we know that the space is enough */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008501 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00008502 }
8503 else if (repsize!=0) {
8504 /* more than one character */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008505 Py_ssize_t requiredsize = *opos +
8506 (PyUnicode_GET_LENGTH(input) - ipos) +
Benjamin Peterson29060642009-01-31 22:14:21 +00008507 repsize - 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008508 Py_ssize_t i;
8509 if (charmaptranslate_makespace(output, osize, requiredsize))
Benjamin Peterson29060642009-01-31 22:14:21 +00008510 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008511 for(i = 0; i < repsize; i++)
8512 (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00008513 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008514 }
8515 else
Benjamin Peterson29060642009-01-31 22:14:21 +00008516 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008517 return 0;
8518}
8519
Alexander Belopolsky40018472011-02-26 01:02:56 +00008520PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008521_PyUnicode_TranslateCharmap(PyObject *input,
8522 PyObject *mapping,
8523 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008524{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008525 /* input object */
8526 char *idata;
8527 Py_ssize_t size, i;
8528 int kind;
8529 /* output buffer */
8530 Py_UCS4 *output = NULL;
8531 Py_ssize_t osize;
8532 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008533 /* current output position */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008534 Py_ssize_t opos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008535 char *reason = "character maps to <undefined>";
8536 PyObject *errorHandler = NULL;
8537 PyObject *exc = NULL;
8538 /* the following variable is used for caching string comparisons
8539 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8540 * 3=ignore, 4=xmlcharrefreplace */
8541 int known_errorHandler = -1;
8542
Guido van Rossumd57fd912000-03-10 22:53:23 +00008543 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008544 PyErr_BadArgument();
8545 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008546 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008547
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008548 if (PyUnicode_READY(input) == -1)
8549 return NULL;
8550 idata = (char*)PyUnicode_DATA(input);
8551 kind = PyUnicode_KIND(input);
8552 size = PyUnicode_GET_LENGTH(input);
8553 i = 0;
8554
8555 if (size == 0) {
8556 Py_INCREF(input);
8557 return input;
8558 }
8559
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008560 /* allocate enough for a simple 1:1 translation without
8561 replacements, if we need more, we'll resize */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008562 osize = size;
8563 output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8564 opos = 0;
8565 if (output == NULL) {
8566 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00008567 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008568 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008569
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008570 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008571 /* try to encode it */
8572 PyObject *x = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008573 if (charmaptranslate_output(input, i, mapping,
8574 &output, &osize, &opos, &x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008575 Py_XDECREF(x);
8576 goto onError;
8577 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008578 Py_XDECREF(x);
Benjamin Peterson29060642009-01-31 22:14:21 +00008579 if (x!=Py_None) /* it worked => adjust input pointer */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008580 ++i;
Benjamin Peterson29060642009-01-31 22:14:21 +00008581 else { /* untranslatable character */
8582 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8583 Py_ssize_t repsize;
8584 Py_ssize_t newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008585 Py_ssize_t uni2;
Benjamin Peterson29060642009-01-31 22:14:21 +00008586 /* startpos for collecting untranslatable chars */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008587 Py_ssize_t collstart = i;
8588 Py_ssize_t collend = i+1;
8589 Py_ssize_t coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008590
Benjamin Peterson29060642009-01-31 22:14:21 +00008591 /* find all untranslatable characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008592 while (collend < size) {
8593 if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008594 goto onError;
8595 Py_XDECREF(x);
8596 if (x!=Py_None)
8597 break;
8598 ++collend;
8599 }
8600 /* cache callback name lookup
8601 * (if not done yet, i.e. it's the first error) */
8602 if (known_errorHandler==-1) {
8603 if ((errors==NULL) || (!strcmp(errors, "strict")))
8604 known_errorHandler = 1;
8605 else if (!strcmp(errors, "replace"))
8606 known_errorHandler = 2;
8607 else if (!strcmp(errors, "ignore"))
8608 known_errorHandler = 3;
8609 else if (!strcmp(errors, "xmlcharrefreplace"))
8610 known_errorHandler = 4;
8611 else
8612 known_errorHandler = 0;
8613 }
8614 switch (known_errorHandler) {
8615 case 1: /* strict */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008616 raise_translate_exception(&exc, input, collstart,
8617 collend, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008618 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00008619 case 2: /* replace */
8620 /* No need to check for space, this is a 1:1 replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008621 for (coll = collstart; coll<collend; coll++)
8622 output[opos++] = '?';
Benjamin Peterson29060642009-01-31 22:14:21 +00008623 /* fall through */
8624 case 3: /* ignore */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008625 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008626 break;
8627 case 4: /* xmlcharrefreplace */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008628 /* generate replacement (temporarily (mis)uses i) */
8629 for (i = collstart; i < collend; ++i) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008630 char buffer[2+29+1+1];
8631 char *cp;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008632 sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8633 if (charmaptranslate_makespace(&output, &osize,
8634 opos+strlen(buffer)+(size-collend)))
Benjamin Peterson29060642009-01-31 22:14:21 +00008635 goto onError;
8636 for (cp = buffer; *cp; ++cp)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008637 output[opos++] = *cp;
Benjamin Peterson29060642009-01-31 22:14:21 +00008638 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008639 i = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00008640 break;
8641 default:
8642 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008643 reason, input, &exc,
8644 collstart, collend, &newpos);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008645 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008646 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008647 if (PyUnicode_READY(repunicode) < 0) {
8648 Py_DECREF(repunicode);
8649 goto onError;
8650 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008651 /* generate replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008652 repsize = PyUnicode_GET_LENGTH(repunicode);
8653 if (charmaptranslate_makespace(&output, &osize,
8654 opos+repsize+(size-collend))) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008655 Py_DECREF(repunicode);
8656 goto onError;
8657 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008658 for (uni2 = 0; repsize-->0; ++uni2)
8659 output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8660 i = newpos;
Benjamin Peterson29060642009-01-31 22:14:21 +00008661 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008662 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008663 }
8664 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008665 res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8666 if (!res)
8667 goto onError;
8668 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008669 Py_XDECREF(exc);
8670 Py_XDECREF(errorHandler);
8671 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008672
Benjamin Peterson29060642009-01-31 22:14:21 +00008673 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008674 PyMem_Free(output);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008675 Py_XDECREF(exc);
8676 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008677 return NULL;
8678}
8679
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008680/* Deprecated. Use PyUnicode_Translate instead. */
8681PyObject *
8682PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8683 Py_ssize_t size,
8684 PyObject *mapping,
8685 const char *errors)
8686{
8687 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8688 if (!unicode)
8689 return NULL;
8690 return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8691}
8692
Alexander Belopolsky40018472011-02-26 01:02:56 +00008693PyObject *
8694PyUnicode_Translate(PyObject *str,
8695 PyObject *mapping,
8696 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008697{
8698 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008699
Guido van Rossumd57fd912000-03-10 22:53:23 +00008700 str = PyUnicode_FromObject(str);
8701 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008702 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008703 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008704 Py_DECREF(str);
8705 return result;
Tim Petersced69f82003-09-16 20:30:58 +00008706
Benjamin Peterson29060642009-01-31 22:14:21 +00008707 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00008708 Py_XDECREF(str);
8709 return NULL;
8710}
Tim Petersced69f82003-09-16 20:30:58 +00008711
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008712static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008713fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008714{
8715 /* No need to call PyUnicode_READY(self) because this function is only
8716 called as a callback from fixup() which does it already. */
8717 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8718 const int kind = PyUnicode_KIND(self);
8719 void *data = PyUnicode_DATA(self);
8720 Py_UCS4 maxchar = 0, ch, fixed;
8721 Py_ssize_t i;
8722
8723 for (i = 0; i < len; ++i) {
8724 ch = PyUnicode_READ(kind, data, i);
8725 fixed = 0;
8726 if (ch > 127) {
8727 if (Py_UNICODE_ISSPACE(ch))
8728 fixed = ' ';
8729 else {
8730 const int decimal = Py_UNICODE_TODECIMAL(ch);
8731 if (decimal >= 0)
8732 fixed = '0' + decimal;
8733 }
8734 if (fixed != 0) {
8735 if (fixed > maxchar)
8736 maxchar = fixed;
8737 PyUnicode_WRITE(kind, data, i, fixed);
8738 }
8739 else if (ch > maxchar)
8740 maxchar = ch;
8741 }
8742 else if (ch > maxchar)
8743 maxchar = ch;
8744 }
8745
8746 return maxchar;
8747}
8748
8749PyObject *
8750_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8751{
8752 if (!PyUnicode_Check(unicode)) {
8753 PyErr_BadInternalCall();
8754 return NULL;
8755 }
8756 if (PyUnicode_READY(unicode) == -1)
8757 return NULL;
8758 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8759 /* If the string is already ASCII, just return the same string */
8760 Py_INCREF(unicode);
8761 return unicode;
8762 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008763 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008764}
8765
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008766PyObject *
8767PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8768 Py_ssize_t length)
8769{
Victor Stinnerf0124502011-11-21 23:12:56 +01008770 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008771 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008772 Py_UCS4 maxchar;
8773 enum PyUnicode_Kind kind;
8774 void *data;
8775
8776 maxchar = 0;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008777 for (i = 0; i < length; i++) {
Victor Stinnerf0124502011-11-21 23:12:56 +01008778 Py_UNICODE ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008779 if (ch > 127) {
8780 int decimal = Py_UNICODE_TODECIMAL(ch);
8781 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008782 ch = '0' + decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008783 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008784 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008785 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008786
8787 /* Copy to a new string */
8788 decimal = PyUnicode_New(length, maxchar);
8789 if (decimal == NULL)
8790 return decimal;
8791 kind = PyUnicode_KIND(decimal);
8792 data = PyUnicode_DATA(decimal);
8793 /* Iterate over code points */
8794 for (i = 0; i < length; i++) {
8795 Py_UNICODE ch = s[i];
8796 if (ch > 127) {
8797 int decimal = Py_UNICODE_TODECIMAL(ch);
8798 if (decimal >= 0)
8799 ch = '0' + decimal;
8800 }
8801 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008802 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008803 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008804}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008805/* --- Decimal Encoder ---------------------------------------------------- */
8806
Alexander Belopolsky40018472011-02-26 01:02:56 +00008807int
8808PyUnicode_EncodeDecimal(Py_UNICODE *s,
8809 Py_ssize_t length,
8810 char *output,
8811 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008812{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008813 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008814 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008815 enum PyUnicode_Kind kind;
8816 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008817
8818 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008819 PyErr_BadArgument();
8820 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008821 }
8822
Victor Stinner42bf7752011-11-21 22:52:58 +01008823 unicode = PyUnicode_FromUnicode(s, length);
8824 if (unicode == NULL)
8825 return -1;
8826
Victor Stinner6345be92011-11-25 20:09:01 +01008827 if (PyUnicode_READY(unicode) < 0) {
8828 Py_DECREF(unicode);
8829 return -1;
8830 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008831 kind = PyUnicode_KIND(unicode);
8832 data = PyUnicode_DATA(unicode);
8833
Victor Stinnerb84d7232011-11-22 01:50:07 +01008834 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008835 PyObject *exc;
8836 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008837 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008838 Py_ssize_t startpos;
8839
8840 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008841
Benjamin Peterson29060642009-01-31 22:14:21 +00008842 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008843 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008844 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008845 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008846 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008847 decimal = Py_UNICODE_TODECIMAL(ch);
8848 if (decimal >= 0) {
8849 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008850 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008851 continue;
8852 }
8853 if (0 < ch && ch < 256) {
8854 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008855 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008856 continue;
8857 }
Victor Stinner6345be92011-11-25 20:09:01 +01008858
Victor Stinner42bf7752011-11-21 22:52:58 +01008859 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008860 exc = NULL;
8861 raise_encode_exception(&exc, "decimal", unicode,
8862 startpos, startpos+1,
8863 "invalid decimal Unicode string");
8864 Py_XDECREF(exc);
8865 Py_DECREF(unicode);
8866 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008867 }
8868 /* 0-terminate the output string */
8869 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008870 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008871 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008872}
8873
Guido van Rossumd57fd912000-03-10 22:53:23 +00008874/* --- Helpers ------------------------------------------------------------ */
8875
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008876static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008877any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008878 Py_ssize_t start,
8879 Py_ssize_t end)
8880{
8881 int kind1, kind2, kind;
8882 void *buf1, *buf2;
8883 Py_ssize_t len1, len2, result;
8884
8885 kind1 = PyUnicode_KIND(s1);
8886 kind2 = PyUnicode_KIND(s2);
8887 kind = kind1 > kind2 ? kind1 : kind2;
8888 buf1 = PyUnicode_DATA(s1);
8889 buf2 = PyUnicode_DATA(s2);
8890 if (kind1 != kind)
8891 buf1 = _PyUnicode_AsKind(s1, kind);
8892 if (!buf1)
8893 return -2;
8894 if (kind2 != kind)
8895 buf2 = _PyUnicode_AsKind(s2, kind);
8896 if (!buf2) {
8897 if (kind1 != kind) PyMem_Free(buf1);
8898 return -2;
8899 }
8900 len1 = PyUnicode_GET_LENGTH(s1);
8901 len2 = PyUnicode_GET_LENGTH(s2);
8902
Victor Stinner794d5672011-10-10 03:21:36 +02008903 if (direction > 0) {
8904 switch(kind) {
8905 case PyUnicode_1BYTE_KIND:
8906 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8907 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8908 else
8909 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8910 break;
8911 case PyUnicode_2BYTE_KIND:
8912 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8913 break;
8914 case PyUnicode_4BYTE_KIND:
8915 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8916 break;
8917 default:
8918 assert(0); result = -2;
8919 }
8920 }
8921 else {
8922 switch(kind) {
8923 case PyUnicode_1BYTE_KIND:
8924 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8925 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8926 else
8927 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8928 break;
8929 case PyUnicode_2BYTE_KIND:
8930 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8931 break;
8932 case PyUnicode_4BYTE_KIND:
8933 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8934 break;
8935 default:
8936 assert(0); result = -2;
8937 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008938 }
8939
8940 if (kind1 != kind)
8941 PyMem_Free(buf1);
8942 if (kind2 != kind)
8943 PyMem_Free(buf2);
8944
8945 return result;
8946}
8947
8948Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02008949_PyUnicode_InsertThousandsGrouping(PyObject *unicode, int kind, void *data,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008950 Py_ssize_t n_buffer,
8951 void *digits, Py_ssize_t n_digits,
8952 Py_ssize_t min_width,
8953 const char *grouping,
8954 const char *thousands_sep)
8955{
8956 switch(kind) {
8957 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02008958 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
8959 return _PyUnicode_ascii_InsertThousandsGrouping(
8960 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8961 min_width, grouping, thousands_sep);
8962 else
8963 return _PyUnicode_ucs1_InsertThousandsGrouping(
8964 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8965 min_width, grouping, thousands_sep);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008966 case PyUnicode_2BYTE_KIND:
8967 return _PyUnicode_ucs2_InsertThousandsGrouping(
8968 (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8969 min_width, grouping, thousands_sep);
8970 case PyUnicode_4BYTE_KIND:
8971 return _PyUnicode_ucs4_InsertThousandsGrouping(
8972 (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8973 min_width, grouping, thousands_sep);
8974 }
8975 assert(0);
8976 return -1;
8977}
8978
8979
/* helper macro to fixup start/end slice values:
   `end` is clamped to `len`; a negative `end` or `start` is taken
   relative to `len` and then clamped to 0.  `start` and `end` must be
   modifiable lvalues; `len` is evaluated more than once.
   Wrapped in do { } while (0) so that the expansion is one statement and
   is safe in an unbraced if/else; use it with a trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00008994
Alexander Belopolsky40018472011-02-26 01:02:56 +00008995Py_ssize_t
8996PyUnicode_Count(PyObject *str,
8997 PyObject *substr,
8998 Py_ssize_t start,
8999 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009000{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009001 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009002 PyObject* str_obj;
9003 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009004 int kind1, kind2, kind;
9005 void *buf1 = NULL, *buf2 = NULL;
9006 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009007
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009008 str_obj = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009009 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009010 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009011 sub_obj = PyUnicode_FromObject(substr);
Victor Stinnere9a29352011-10-01 02:14:59 +02009012 if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009013 Py_DECREF(str_obj);
9014 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009015 }
Tim Petersced69f82003-09-16 20:30:58 +00009016
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009017 kind1 = PyUnicode_KIND(str_obj);
9018 kind2 = PyUnicode_KIND(sub_obj);
9019 kind = kind1 > kind2 ? kind1 : kind2;
9020 buf1 = PyUnicode_DATA(str_obj);
9021 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009022 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009023 if (!buf1)
9024 goto onError;
9025 buf2 = PyUnicode_DATA(sub_obj);
9026 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009027 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009028 if (!buf2)
9029 goto onError;
9030 len1 = PyUnicode_GET_LENGTH(str_obj);
9031 len2 = PyUnicode_GET_LENGTH(sub_obj);
9032
9033 ADJUST_INDICES(start, end, len1);
9034 switch(kind) {
9035 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009036 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9037 result = asciilib_count(
9038 ((Py_UCS1*)buf1) + start, end - start,
9039 buf2, len2, PY_SSIZE_T_MAX
9040 );
9041 else
9042 result = ucs1lib_count(
9043 ((Py_UCS1*)buf1) + start, end - start,
9044 buf2, len2, PY_SSIZE_T_MAX
9045 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009046 break;
9047 case PyUnicode_2BYTE_KIND:
9048 result = ucs2lib_count(
9049 ((Py_UCS2*)buf1) + start, end - start,
9050 buf2, len2, PY_SSIZE_T_MAX
9051 );
9052 break;
9053 case PyUnicode_4BYTE_KIND:
9054 result = ucs4lib_count(
9055 ((Py_UCS4*)buf1) + start, end - start,
9056 buf2, len2, PY_SSIZE_T_MAX
9057 );
9058 break;
9059 default:
9060 assert(0); result = 0;
9061 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009062
9063 Py_DECREF(sub_obj);
9064 Py_DECREF(str_obj);
9065
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009066 if (kind1 != kind)
9067 PyMem_Free(buf1);
9068 if (kind2 != kind)
9069 PyMem_Free(buf2);
9070
Guido van Rossumd57fd912000-03-10 22:53:23 +00009071 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009072 onError:
9073 Py_DECREF(sub_obj);
9074 Py_DECREF(str_obj);
9075 if (kind1 != kind && buf1)
9076 PyMem_Free(buf1);
9077 if (kind2 != kind && buf2)
9078 PyMem_Free(buf2);
9079 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009080}
9081
Alexander Belopolsky40018472011-02-26 01:02:56 +00009082Py_ssize_t
9083PyUnicode_Find(PyObject *str,
9084 PyObject *sub,
9085 Py_ssize_t start,
9086 Py_ssize_t end,
9087 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009088{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009089 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009090
Guido van Rossumd57fd912000-03-10 22:53:23 +00009091 str = PyUnicode_FromObject(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009092 if (!str || PyUnicode_READY(str) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009093 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009094 sub = PyUnicode_FromObject(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009095 if (!sub || PyUnicode_READY(sub) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009096 Py_DECREF(str);
9097 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009098 }
Tim Petersced69f82003-09-16 20:30:58 +00009099
Victor Stinner794d5672011-10-10 03:21:36 +02009100 result = any_find_slice(direction,
9101 str, sub, start, end
9102 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009103
Guido van Rossumd57fd912000-03-10 22:53:23 +00009104 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009105 Py_DECREF(sub);
9106
Guido van Rossumd57fd912000-03-10 22:53:23 +00009107 return result;
9108}
9109
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009110Py_ssize_t
9111PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9112 Py_ssize_t start, Py_ssize_t end,
9113 int direction)
9114{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009115 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009116 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009117 if (PyUnicode_READY(str) == -1)
9118 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009119 if (start < 0 || end < 0) {
9120 PyErr_SetString(PyExc_IndexError, "string index out of range");
9121 return -2;
9122 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009123 if (end > PyUnicode_GET_LENGTH(str))
9124 end = PyUnicode_GET_LENGTH(str);
9125 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009126 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9127 kind, end-start, ch, direction);
9128 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009129 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009130 else
9131 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009132}
9133
Alexander Belopolsky40018472011-02-26 01:02:56 +00009134static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009135tailmatch(PyObject *self,
9136 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009137 Py_ssize_t start,
9138 Py_ssize_t end,
9139 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009140{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009141 int kind_self;
9142 int kind_sub;
9143 void *data_self;
9144 void *data_sub;
9145 Py_ssize_t offset;
9146 Py_ssize_t i;
9147 Py_ssize_t end_sub;
9148
9149 if (PyUnicode_READY(self) == -1 ||
9150 PyUnicode_READY(substring) == -1)
9151 return 0;
9152
9153 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009154 return 1;
9155
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009156 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9157 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009158 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009159 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009160
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009161 kind_self = PyUnicode_KIND(self);
9162 data_self = PyUnicode_DATA(self);
9163 kind_sub = PyUnicode_KIND(substring);
9164 data_sub = PyUnicode_DATA(substring);
9165 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9166
9167 if (direction > 0)
9168 offset = end;
9169 else
9170 offset = start;
9171
9172 if (PyUnicode_READ(kind_self, data_self, offset) ==
9173 PyUnicode_READ(kind_sub, data_sub, 0) &&
9174 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9175 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9176 /* If both are of the same kind, memcmp is sufficient */
9177 if (kind_self == kind_sub) {
9178 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009179 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009180 data_sub,
9181 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009182 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009183 }
9184 /* otherwise we have to compare each character by first accesing it */
9185 else {
9186 /* We do not need to compare 0 and len(substring)-1 because
9187 the if statement above ensured already that they are equal
9188 when we end up here. */
9189 // TODO: honor direction and do a forward or backwards search
9190 for (i = 1; i < end_sub; ++i) {
9191 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9192 PyUnicode_READ(kind_sub, data_sub, i))
9193 return 0;
9194 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009195 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009196 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009197 }
9198
9199 return 0;
9200}
9201
Alexander Belopolsky40018472011-02-26 01:02:56 +00009202Py_ssize_t
9203PyUnicode_Tailmatch(PyObject *str,
9204 PyObject *substr,
9205 Py_ssize_t start,
9206 Py_ssize_t end,
9207 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009208{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009209 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009210
Guido van Rossumd57fd912000-03-10 22:53:23 +00009211 str = PyUnicode_FromObject(str);
9212 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009213 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009214 substr = PyUnicode_FromObject(substr);
9215 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009216 Py_DECREF(str);
9217 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009218 }
Tim Petersced69f82003-09-16 20:30:58 +00009219
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009220 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009221 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009222 Py_DECREF(str);
9223 Py_DECREF(substr);
9224 return result;
9225}
9226
Guido van Rossumd57fd912000-03-10 22:53:23 +00009227/* Apply fixfct filter to the Unicode object self and return a
9228 reference to the modified object */
9229
Alexander Belopolsky40018472011-02-26 01:02:56 +00009230static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009231fixup(PyObject *self,
9232 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009233{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009234 PyObject *u;
9235 Py_UCS4 maxchar_old, maxchar_new = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009236
Victor Stinner87af4f22011-11-21 23:03:47 +01009237 u = PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009238 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009239 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009240 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009241
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009242 /* fix functions return the new maximum character in a string,
9243 if the kind of the resulting unicode object does not change,
9244 everything is fine. Otherwise we need to change the string kind
9245 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009246 maxchar_new = fixfct(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009247 if (maxchar_new == 0)
9248 /* do nothing, keep maxchar_new at 0 which means no changes. */;
9249 else if (maxchar_new <= 127)
9250 maxchar_new = 127;
9251 else if (maxchar_new <= 255)
9252 maxchar_new = 255;
9253 else if (maxchar_new <= 65535)
9254 maxchar_new = 65535;
9255 else
Victor Stinner8faf8212011-12-08 22:14:11 +01009256 maxchar_new = MAX_UNICODE;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009257
9258 if (!maxchar_new && PyUnicode_CheckExact(self)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009259 /* fixfct should return TRUE if it modified the buffer. If
9260 FALSE, return a reference to the original buffer instead
9261 (to save space, not time) */
9262 Py_INCREF(self);
9263 Py_DECREF(u);
Victor Stinner7931d9a2011-11-04 00:22:48 +01009264 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009265 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009266 else if (maxchar_new == maxchar_old) {
9267 return u;
9268 }
9269 else {
9270 /* In case the maximum character changed, we need to
9271 convert the string to the new category. */
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009272 PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009273 if (v == NULL) {
9274 Py_DECREF(u);
9275 return NULL;
9276 }
9277 if (maxchar_new > maxchar_old) {
9278 /* If the maxchar increased so that the kind changed, not all
9279 characters are representable anymore and we need to fix the
9280 string again. This only happens in very few cases. */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009281 copy_characters(v, 0, self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner9310abb2011-10-05 00:59:23 +02009282 maxchar_old = fixfct(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009283 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
9284 }
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009285 else {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009286 copy_characters(v, 0, u, 0, PyUnicode_GET_LENGTH(self));
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009287 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009288
9289 Py_DECREF(u);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009290 assert(_PyUnicode_CheckConsistency(v, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009291 return v;
9292 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009293}
9294
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009295static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009296fixupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009297{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009298 /* No need to call PyUnicode_READY(self) because this function is only
9299 called as a callback from fixup() which does it already. */
9300 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9301 const int kind = PyUnicode_KIND(self);
9302 void *data = PyUnicode_DATA(self);
9303 int touched = 0;
9304 Py_UCS4 maxchar = 0;
9305 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009306
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009307 for (i = 0; i < len; ++i) {
9308 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9309 const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
9310 if (up != ch) {
9311 if (up > maxchar)
9312 maxchar = up;
9313 PyUnicode_WRITE(kind, data, i, up);
9314 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009315 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009316 else if (ch > maxchar)
9317 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009318 }
9319
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009320 if (touched)
9321 return maxchar;
9322 else
9323 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009324}
9325
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009326static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009327fixlower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009328{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009329 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9330 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9331 const int kind = PyUnicode_KIND(self);
9332 void *data = PyUnicode_DATA(self);
9333 int touched = 0;
9334 Py_UCS4 maxchar = 0;
9335 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009336
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009337 for(i = 0; i < len; ++i) {
9338 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9339 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9340 if (lo != ch) {
9341 if (lo > maxchar)
9342 maxchar = lo;
9343 PyUnicode_WRITE(kind, data, i, lo);
9344 touched = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009345 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009346 else if (ch > maxchar)
9347 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009348 }
9349
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009350 if (touched)
9351 return maxchar;
9352 else
9353 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009354}
9355
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009356static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009357fixswapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009358{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009359 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9360 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9361 const int kind = PyUnicode_KIND(self);
9362 void *data = PyUnicode_DATA(self);
9363 int touched = 0;
9364 Py_UCS4 maxchar = 0;
9365 Py_ssize_t i;
Tim Petersced69f82003-09-16 20:30:58 +00009366
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009367 for(i = 0; i < len; ++i) {
9368 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9369 Py_UCS4 nu = 0;
9370
9371 if (Py_UNICODE_ISUPPER(ch))
9372 nu = Py_UNICODE_TOLOWER(ch);
9373 else if (Py_UNICODE_ISLOWER(ch))
9374 nu = Py_UNICODE_TOUPPER(ch);
9375
9376 if (nu != 0) {
9377 if (nu > maxchar)
9378 maxchar = nu;
9379 PyUnicode_WRITE(kind, data, i, nu);
9380 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009381 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009382 else if (ch > maxchar)
9383 maxchar = ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009384 }
9385
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009386 if (touched)
9387 return maxchar;
9388 else
9389 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009390}
9391
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009392static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009393fixcapitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009394{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009395 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9396 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9397 const int kind = PyUnicode_KIND(self);
9398 void *data = PyUnicode_DATA(self);
9399 int touched = 0;
9400 Py_UCS4 maxchar = 0;
9401 Py_ssize_t i = 0;
9402 Py_UCS4 ch;
Tim Petersced69f82003-09-16 20:30:58 +00009403
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009404 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009405 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009406
9407 ch = PyUnicode_READ(kind, data, i);
9408 if (!Py_UNICODE_ISUPPER(ch)) {
9409 maxchar = Py_UNICODE_TOUPPER(ch);
9410 PyUnicode_WRITE(kind, data, i, maxchar);
9411 touched = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009412 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009413 ++i;
9414 for(; i < len; ++i) {
9415 ch = PyUnicode_READ(kind, data, i);
9416 if (!Py_UNICODE_ISLOWER(ch)) {
9417 const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
9418 if (lo > maxchar)
9419 maxchar = lo;
9420 PyUnicode_WRITE(kind, data, i, lo);
9421 touched = 1;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009422 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009423 else if (ch > maxchar)
9424 maxchar = ch;
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009425 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009426
9427 if (touched)
9428 return maxchar;
9429 else
9430 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009431}
9432
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009433static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02009434fixtitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009435{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009436 /* No need to call PyUnicode_READY(self) because fixup() which does it. */
9437 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9438 const int kind = PyUnicode_KIND(self);
9439 void *data = PyUnicode_DATA(self);
9440 Py_UCS4 maxchar = 0;
9441 Py_ssize_t i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009442 int previous_is_cased;
9443
9444 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009445 if (len == 1) {
9446 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9447 const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
9448 if (ti != ch) {
9449 PyUnicode_WRITE(kind, data, i, ti);
9450 return ti;
Benjamin Peterson29060642009-01-31 22:14:21 +00009451 }
9452 else
9453 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009454 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009455 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009456 for(; i < len; ++i) {
9457 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9458 Py_UCS4 nu;
Tim Petersced69f82003-09-16 20:30:58 +00009459
Benjamin Peterson29060642009-01-31 22:14:21 +00009460 if (previous_is_cased)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009461 nu = Py_UNICODE_TOLOWER(ch);
Benjamin Peterson29060642009-01-31 22:14:21 +00009462 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009463 nu = Py_UNICODE_TOTITLE(ch);
9464
9465 if (nu > maxchar)
9466 maxchar = nu;
9467 PyUnicode_WRITE(kind, data, i, nu);
Tim Petersced69f82003-09-16 20:30:58 +00009468
Benjamin Peterson29060642009-01-31 22:14:21 +00009469 if (Py_UNICODE_ISLOWER(ch) ||
9470 Py_UNICODE_ISUPPER(ch) ||
9471 Py_UNICODE_ISTITLE(ch))
9472 previous_is_cased = 1;
9473 else
9474 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009475 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009476 return maxchar;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009477}
9478
Tim Peters8ce9f162004-08-27 01:49:32 +00009479PyObject *
9480PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009481{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009482 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009483 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009484 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009485 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009486 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9487 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009488 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009489 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009490 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009491 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009492 int use_memcpy;
9493 unsigned char *res_data = NULL, *sep_data = NULL;
9494 PyObject *last_obj;
9495 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009496
Tim Peters05eba1f2004-08-27 21:32:02 +00009497 fseq = PySequence_Fast(seq, "");
9498 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009499 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009500 }
9501
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009502 /* NOTE: the following code can't call back into Python code,
9503 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009504 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009505
Tim Peters05eba1f2004-08-27 21:32:02 +00009506 seqlen = PySequence_Fast_GET_SIZE(fseq);
9507 /* If empty sequence, return u"". */
9508 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009509 Py_DECREF(fseq);
9510 Py_INCREF(unicode_empty);
9511 res = unicode_empty;
9512 return res;
Tim Peters05eba1f2004-08-27 21:32:02 +00009513 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009514
Tim Peters05eba1f2004-08-27 21:32:02 +00009515 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009516 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009517 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009518 if (seqlen == 1) {
9519 if (PyUnicode_CheckExact(items[0])) {
9520 res = items[0];
9521 Py_INCREF(res);
9522 Py_DECREF(fseq);
9523 return res;
9524 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009525 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009526 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009527 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009528 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009529 /* Set up sep and seplen */
9530 if (separator == NULL) {
9531 /* fall back to a blank space separator */
9532 sep = PyUnicode_FromOrdinal(' ');
9533 if (!sep)
9534 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009535 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009536 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009537 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009538 else {
9539 if (!PyUnicode_Check(separator)) {
9540 PyErr_Format(PyExc_TypeError,
9541 "separator: expected str instance,"
9542 " %.80s found",
9543 Py_TYPE(separator)->tp_name);
9544 goto onError;
9545 }
9546 if (PyUnicode_READY(separator))
9547 goto onError;
9548 sep = separator;
9549 seplen = PyUnicode_GET_LENGTH(separator);
9550 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9551 /* inc refcount to keep this code path symmetric with the
9552 above case of a blank separator */
9553 Py_INCREF(sep);
9554 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009555 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009556 }
9557
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009558 /* There are at least two things to join, or else we have a subclass
9559 * of str in the sequence.
9560 * Do a pre-pass to figure out the total amount of space we'll
9561 * need (sz), and see whether all argument are strings.
9562 */
9563 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009564#ifdef Py_DEBUG
9565 use_memcpy = 0;
9566#else
9567 use_memcpy = 1;
9568#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009569 for (i = 0; i < seqlen; i++) {
9570 const Py_ssize_t old_sz = sz;
9571 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009572 if (!PyUnicode_Check(item)) {
9573 PyErr_Format(PyExc_TypeError,
9574 "sequence item %zd: expected str instance,"
9575 " %.80s found",
9576 i, Py_TYPE(item)->tp_name);
9577 goto onError;
9578 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009579 if (PyUnicode_READY(item) == -1)
9580 goto onError;
9581 sz += PyUnicode_GET_LENGTH(item);
9582 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009583 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009584 if (i != 0)
9585 sz += seplen;
9586 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9587 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009588 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009589 goto onError;
9590 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009591 if (use_memcpy && last_obj != NULL) {
9592 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9593 use_memcpy = 0;
9594 }
9595 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009596 }
Tim Petersced69f82003-09-16 20:30:58 +00009597
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009598 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009599 if (res == NULL)
9600 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009601
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009602 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009603#ifdef Py_DEBUG
9604 use_memcpy = 0;
9605#else
9606 if (use_memcpy) {
9607 res_data = PyUnicode_1BYTE_DATA(res);
9608 kind = PyUnicode_KIND(res);
9609 if (seplen != 0)
9610 sep_data = PyUnicode_1BYTE_DATA(sep);
9611 }
9612#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009613 for (i = 0, res_offset = 0; i < seqlen; ++i) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009614 Py_ssize_t itemlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009615 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009616 /* Copy item, and maybe the separator. */
Victor Stinner9ce5a832011-10-03 23:36:02 +02009617 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009618 if (use_memcpy) {
9619 Py_MEMCPY(res_data,
9620 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009621 kind * seplen);
9622 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009623 }
9624 else {
9625 copy_characters(res, res_offset, sep, 0, seplen);
9626 res_offset += seplen;
9627 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009628 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009629 itemlen = PyUnicode_GET_LENGTH(item);
9630 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009631 if (use_memcpy) {
9632 Py_MEMCPY(res_data,
9633 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009634 kind * itemlen);
9635 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009636 }
9637 else {
9638 copy_characters(res, res_offset, item, 0, itemlen);
9639 res_offset += itemlen;
9640 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009641 }
Tim Peters05eba1f2004-08-27 21:32:02 +00009642 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009643 if (use_memcpy)
9644 assert(res_data == PyUnicode_1BYTE_DATA(res)
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009645 + kind * PyUnicode_GET_LENGTH(res));
Victor Stinnerdd077322011-10-07 17:02:31 +02009646 else
9647 assert(res_offset == PyUnicode_GET_LENGTH(res));
Tim Peters8ce9f162004-08-27 01:49:32 +00009648
Tim Peters05eba1f2004-08-27 21:32:02 +00009649 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009650 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009651 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009652 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009653
Benjamin Peterson29060642009-01-31 22:14:21 +00009654 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009655 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009656 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009657 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009658 return NULL;
9659}
9660
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009661#define FILL(kind, data, value, start, length) \
9662 do { \
9663 Py_ssize_t i_ = 0; \
9664 assert(kind != PyUnicode_WCHAR_KIND); \
9665 switch ((kind)) { \
9666 case PyUnicode_1BYTE_KIND: { \
9667 unsigned char * to_ = (unsigned char *)((data)) + (start); \
9668 memset(to_, (unsigned char)value, length); \
9669 break; \
9670 } \
9671 case PyUnicode_2BYTE_KIND: { \
9672 Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
9673 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9674 break; \
9675 } \
9676 default: { \
9677 Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
9678 for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
9679 break; \
9680 } \
9681 } \
9682 } while (0)
9683
Victor Stinner9310abb2011-10-05 00:59:23 +02009684static PyObject *
9685pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009686 Py_ssize_t left,
9687 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009688 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009689{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009690 PyObject *u;
9691 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009692 int kind;
9693 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009694
9695 if (left < 0)
9696 left = 0;
9697 if (right < 0)
9698 right = 0;
9699
Tim Peters7a29bd52001-09-12 03:03:31 +00009700 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00009701 Py_INCREF(self);
9702 return self;
9703 }
9704
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009705 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9706 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009707 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9708 return NULL;
9709 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009710 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9711 if (fill > maxchar)
9712 maxchar = fill;
9713 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009714 if (!u)
9715 return NULL;
9716
9717 kind = PyUnicode_KIND(u);
9718 data = PyUnicode_DATA(u);
9719 if (left)
9720 FILL(kind, data, fill, 0, left);
9721 if (right)
9722 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009723 copy_characters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009724 assert(_PyUnicode_CheckConsistency(u, 1));
9725 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009726}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009727#undef FILL
Guido van Rossumd57fd912000-03-10 22:53:23 +00009728
Alexander Belopolsky40018472011-02-26 01:02:56 +00009729PyObject *
9730PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009731{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009732 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009733
9734 string = PyUnicode_FromObject(string);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009735 if (string == NULL || PyUnicode_READY(string) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009736 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009737
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009738 switch(PyUnicode_KIND(string)) {
9739 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009740 if (PyUnicode_IS_ASCII(string))
9741 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009742 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009743 PyUnicode_GET_LENGTH(string), keepends);
9744 else
9745 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009746 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009747 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009748 break;
9749 case PyUnicode_2BYTE_KIND:
9750 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009751 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009752 PyUnicode_GET_LENGTH(string), keepends);
9753 break;
9754 case PyUnicode_4BYTE_KIND:
9755 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009756 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009757 PyUnicode_GET_LENGTH(string), keepends);
9758 break;
9759 default:
9760 assert(0);
9761 list = 0;
9762 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009763 Py_DECREF(string);
9764 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009765}
9766
Alexander Belopolsky40018472011-02-26 01:02:56 +00009767static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009768split(PyObject *self,
9769 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009770 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009771{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009772 int kind1, kind2, kind;
9773 void *buf1, *buf2;
9774 Py_ssize_t len1, len2;
9775 PyObject* out;
9776
Guido van Rossumd57fd912000-03-10 22:53:23 +00009777 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009778 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009779
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009780 if (PyUnicode_READY(self) == -1)
9781 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009782
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009783 if (substring == NULL)
9784 switch(PyUnicode_KIND(self)) {
9785 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009786 if (PyUnicode_IS_ASCII(self))
9787 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009788 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009789 PyUnicode_GET_LENGTH(self), maxcount
9790 );
9791 else
9792 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009793 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009794 PyUnicode_GET_LENGTH(self), maxcount
9795 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009796 case PyUnicode_2BYTE_KIND:
9797 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009798 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009799 PyUnicode_GET_LENGTH(self), maxcount
9800 );
9801 case PyUnicode_4BYTE_KIND:
9802 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009803 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009804 PyUnicode_GET_LENGTH(self), maxcount
9805 );
9806 default:
9807 assert(0);
9808 return NULL;
9809 }
9810
9811 if (PyUnicode_READY(substring) == -1)
9812 return NULL;
9813
9814 kind1 = PyUnicode_KIND(self);
9815 kind2 = PyUnicode_KIND(substring);
9816 kind = kind1 > kind2 ? kind1 : kind2;
9817 buf1 = PyUnicode_DATA(self);
9818 buf2 = PyUnicode_DATA(substring);
9819 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009820 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009821 if (!buf1)
9822 return NULL;
9823 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009824 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009825 if (!buf2) {
9826 if (kind1 != kind) PyMem_Free(buf1);
9827 return NULL;
9828 }
9829 len1 = PyUnicode_GET_LENGTH(self);
9830 len2 = PyUnicode_GET_LENGTH(substring);
9831
9832 switch(kind) {
9833 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009834 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9835 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009836 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009837 else
9838 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009839 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009840 break;
9841 case PyUnicode_2BYTE_KIND:
9842 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009843 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009844 break;
9845 case PyUnicode_4BYTE_KIND:
9846 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009847 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009848 break;
9849 default:
9850 out = NULL;
9851 }
9852 if (kind1 != kind)
9853 PyMem_Free(buf1);
9854 if (kind2 != kind)
9855 PyMem_Free(buf2);
9856 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009857}
9858
Alexander Belopolsky40018472011-02-26 01:02:56 +00009859static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009860rsplit(PyObject *self,
9861 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009862 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009863{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009864 int kind1, kind2, kind;
9865 void *buf1, *buf2;
9866 Py_ssize_t len1, len2;
9867 PyObject* out;
9868
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009869 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009870 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009871
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009872 if (PyUnicode_READY(self) == -1)
9873 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009874
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009875 if (substring == NULL)
9876 switch(PyUnicode_KIND(self)) {
9877 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009878 if (PyUnicode_IS_ASCII(self))
9879 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009880 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009881 PyUnicode_GET_LENGTH(self), maxcount
9882 );
9883 else
9884 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009885 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009886 PyUnicode_GET_LENGTH(self), maxcount
9887 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009888 case PyUnicode_2BYTE_KIND:
9889 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009890 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009891 PyUnicode_GET_LENGTH(self), maxcount
9892 );
9893 case PyUnicode_4BYTE_KIND:
9894 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009895 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009896 PyUnicode_GET_LENGTH(self), maxcount
9897 );
9898 default:
9899 assert(0);
9900 return NULL;
9901 }
9902
9903 if (PyUnicode_READY(substring) == -1)
9904 return NULL;
9905
9906 kind1 = PyUnicode_KIND(self);
9907 kind2 = PyUnicode_KIND(substring);
9908 kind = kind1 > kind2 ? kind1 : kind2;
9909 buf1 = PyUnicode_DATA(self);
9910 buf2 = PyUnicode_DATA(substring);
9911 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009912 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009913 if (!buf1)
9914 return NULL;
9915 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +01009916 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009917 if (!buf2) {
9918 if (kind1 != kind) PyMem_Free(buf1);
9919 return NULL;
9920 }
9921 len1 = PyUnicode_GET_LENGTH(self);
9922 len2 = PyUnicode_GET_LENGTH(substring);
9923
9924 switch(kind) {
9925 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009926 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9927 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009928 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009929 else
9930 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009931 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009932 break;
9933 case PyUnicode_2BYTE_KIND:
9934 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009935 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009936 break;
9937 case PyUnicode_4BYTE_KIND:
9938 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009939 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009940 break;
9941 default:
9942 out = NULL;
9943 }
9944 if (kind1 != kind)
9945 PyMem_Free(buf1);
9946 if (kind2 != kind)
9947 PyMem_Free(buf2);
9948 return out;
9949}
9950
9951static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009952anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9953 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009954{
9955 switch(kind) {
9956 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009957 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9958 return asciilib_find(buf1, len1, buf2, len2, offset);
9959 else
9960 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009961 case PyUnicode_2BYTE_KIND:
9962 return ucs2lib_find(buf1, len1, buf2, len2, offset);
9963 case PyUnicode_4BYTE_KIND:
9964 return ucs4lib_find(buf1, len1, buf2, len2, offset);
9965 }
9966 assert(0);
9967 return -1;
9968}
9969
9970static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +02009971anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
9972 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009973{
9974 switch(kind) {
9975 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009976 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
9977 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
9978 else
9979 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009980 case PyUnicode_2BYTE_KIND:
9981 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9982 case PyUnicode_4BYTE_KIND:
9983 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9984 }
9985 assert(0);
9986 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00009987}
9988
Alexander Belopolsky40018472011-02-26 01:02:56 +00009989static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009990replace(PyObject *self, PyObject *str1,
9991 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009992{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009993 PyObject *u;
9994 char *sbuf = PyUnicode_DATA(self);
9995 char *buf1 = PyUnicode_DATA(str1);
9996 char *buf2 = PyUnicode_DATA(str2);
9997 int srelease = 0, release1 = 0, release2 = 0;
9998 int skind = PyUnicode_KIND(self);
9999 int kind1 = PyUnicode_KIND(str1);
10000 int kind2 = PyUnicode_KIND(str2);
10001 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10002 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10003 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010004 int mayshrink;
10005 Py_UCS4 maxchar, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010006
10007 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010008 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010009 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010010 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010011
Victor Stinner59de0ee2011-10-07 10:01:28 +020010012 if (str1 == str2)
10013 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010014 if (skind < kind1)
10015 /* substring too wide to be present */
10016 goto nothing;
10017
Victor Stinner49a0a212011-10-12 23:46:10 +020010018 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10019 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10020 /* Replacing str1 with str2 may cause a maxchar reduction in the
10021 result string. */
10022 mayshrink = (maxchar_str2 < maxchar);
10023 maxchar = Py_MAX(maxchar, maxchar_str2);
10024
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010025 if (len1 == len2) {
Antoine Pitroucbfdee32010-01-13 08:58:08 +000010026 Py_ssize_t i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010027 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010028 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010029 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010030 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010031 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010032 Py_UCS4 u1, u2;
10033 int rkind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010034 u1 = PyUnicode_READ_CHAR(str1, 0);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +020010035 if (findchar(sbuf, PyUnicode_KIND(self),
10036 slen, u1, 1) < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010037 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010038 u2 = PyUnicode_READ_CHAR(str2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010039 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010040 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010041 goto error;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010042 copy_characters(u, 0, self, 0, slen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010043 rkind = PyUnicode_KIND(u);
10044 for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
10045 if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010046 if (--maxcount < 0)
10047 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010048 PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010049 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010050 }
10051 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010052 int rkind = skind;
10053 char *res;
Victor Stinner25a4b292011-10-06 12:31:55 +020010054
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010055 if (kind1 < rkind) {
10056 /* widen substring */
10057 buf1 = _PyUnicode_AsKind(str1, rkind);
10058 if (!buf1) goto error;
10059 release1 = 1;
10060 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010061 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010062 if (i < 0)
10063 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010064 if (rkind > kind2) {
10065 /* widen replacement */
10066 buf2 = _PyUnicode_AsKind(str2, rkind);
10067 if (!buf2) goto error;
10068 release2 = 1;
10069 }
10070 else if (rkind < kind2) {
10071 /* widen self and buf1 */
10072 rkind = kind2;
10073 if (release1) PyMem_Free(buf1);
10074 sbuf = _PyUnicode_AsKind(self, rkind);
10075 if (!sbuf) goto error;
10076 srelease = 1;
10077 buf1 = _PyUnicode_AsKind(str1, rkind);
10078 if (!buf1) goto error;
10079 release1 = 1;
10080 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010081 u = PyUnicode_New(slen, maxchar);
10082 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010083 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010084 assert(PyUnicode_KIND(u) == rkind);
10085 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010086
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010087 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010088 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010089 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010090 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010091 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010092 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010093
10094 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010095 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010096 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010097 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010098 if (i == -1)
10099 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010100 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010101 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010102 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010103 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010104 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010105 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010106 }
10107 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010108 Py_ssize_t n, i, j, ires;
10109 Py_ssize_t product, new_size;
10110 int rkind = skind;
10111 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010112
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010113 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010114 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010115 buf1 = _PyUnicode_AsKind(str1, rkind);
10116 if (!buf1) goto error;
10117 release1 = 1;
10118 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010119 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010120 if (n == 0)
10121 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010122 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010123 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010124 buf2 = _PyUnicode_AsKind(str2, rkind);
10125 if (!buf2) goto error;
10126 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010127 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010128 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010129 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010130 rkind = kind2;
10131 sbuf = _PyUnicode_AsKind(self, rkind);
10132 if (!sbuf) goto error;
10133 srelease = 1;
10134 if (release1) PyMem_Free(buf1);
10135 buf1 = _PyUnicode_AsKind(str1, rkind);
10136 if (!buf1) goto error;
10137 release1 = 1;
10138 }
10139 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10140 PyUnicode_GET_LENGTH(str1))); */
10141 product = n * (len2-len1);
10142 if ((product / (len2-len1)) != n) {
10143 PyErr_SetString(PyExc_OverflowError,
10144 "replace string is too long");
10145 goto error;
10146 }
10147 new_size = slen + product;
Victor Stinner49a0a212011-10-12 23:46:10 +020010148 if (new_size == 0) {
10149 Py_INCREF(unicode_empty);
10150 u = unicode_empty;
10151 goto done;
10152 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010153 if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10154 PyErr_SetString(PyExc_OverflowError,
10155 "replace string is too long");
10156 goto error;
10157 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010158 u = PyUnicode_New(new_size, maxchar);
10159 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010160 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010161 assert(PyUnicode_KIND(u) == rkind);
10162 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010163 ires = i = 0;
10164 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010165 while (n-- > 0) {
10166 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010167 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010168 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010169 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010170 if (j == -1)
10171 break;
10172 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010173 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010174 memcpy(res + rkind * ires,
10175 sbuf + rkind * i,
10176 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010177 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010178 }
10179 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010180 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010181 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010182 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010183 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010184 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010185 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010186 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010187 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010188 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010189 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010190 memcpy(res + rkind * ires,
10191 sbuf + rkind * i,
10192 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010193 }
10194 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010195 /* interleave */
10196 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010197 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010198 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010199 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010200 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010201 if (--n <= 0)
10202 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010203 memcpy(res + rkind * ires,
10204 sbuf + rkind * i,
10205 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010206 ires++;
10207 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010208 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010209 memcpy(res + rkind * ires,
10210 sbuf + rkind * i,
10211 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010212 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010213 }
10214
10215 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010216 unicode_adjust_maxchar(&u);
10217 if (u == NULL)
10218 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010219 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010220
10221 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010222 if (srelease)
10223 PyMem_FREE(sbuf);
10224 if (release1)
10225 PyMem_FREE(buf1);
10226 if (release2)
10227 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010228 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010229 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010230
Benjamin Peterson29060642009-01-31 22:14:21 +000010231 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010232 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010233 if (srelease)
10234 PyMem_FREE(sbuf);
10235 if (release1)
10236 PyMem_FREE(buf1);
10237 if (release2)
10238 PyMem_FREE(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010239 if (PyUnicode_CheckExact(self)) {
10240 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010010241 return self;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010242 }
Victor Stinner034f6cf2011-09-30 02:26:44 +020010243 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010244 error:
10245 if (srelease && sbuf)
10246 PyMem_FREE(sbuf);
10247 if (release1 && buf1)
10248 PyMem_FREE(buf1);
10249 if (release2 && buf2)
10250 PyMem_FREE(buf2);
10251 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010252}
10253
10254/* --- Unicode Object Methods --------------------------------------------- */
10255
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010256PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010257 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010258\n\
10259Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010260characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010261
10262static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010263unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010264{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010265 return fixup(self, fixtitle);
10266}
10267
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010268PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010269 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010270\n\
10271Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010272have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010273
10274static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010275unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010276{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010277 return fixup(self, fixcapitalize);
10278}
10279
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out by the surrounding #if 0).

   Splits `self` with split(self, NULL, -1), replaces every list item by
   fixup(item, fixcapitalize) and joins the list again with
   PyUnicode_Join(NULL, list).  Returns the joined string, or NULL if the
   split or any fixup() call fails. */
static PyObject*
unicode_capwords(PyObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t pos;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new item into the list slot */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        result = fixup(PyList_GET_ITEM(words, pos), fixcapitalize);
        if (result == NULL)
            goto onError;
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, result);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

  onError:
    /* Reached on success as well; `result` is NULL only on failure. */
    Py_DECREF(words);
    return result;
}
#endif
10317
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010318/* Argument converter. Coerces to a single unicode character */
10319
10320static int
10321convert_uc(PyObject *obj, void *addr)
10322{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010323 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010324 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010325
Benjamin Peterson14339b62009-01-31 16:36:08 +000010326 uniobj = PyUnicode_FromObject(obj);
10327 if (uniobj == NULL) {
10328 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010329 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010330 return 0;
10331 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010332 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010333 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010334 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010335 Py_DECREF(uniobj);
10336 return 0;
10337 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010338 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010339 Py_DECREF(uniobj);
10340 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010341}
10342
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010343PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010344 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010345\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010346Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010347done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010348
10349static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010350unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010351{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010352 Py_ssize_t marg, left;
10353 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010354 Py_UCS4 fillchar = ' ';
10355
Victor Stinnere9a29352011-10-01 02:14:59 +020010356 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010357 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010358
Victor Stinnere9a29352011-10-01 02:14:59 +020010359 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010360 return NULL;
10361
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010362 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000010363 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010010364 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010365 }
10366
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010367 marg = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010368 left = marg / 2 + (marg & width & 1);
10369
Victor Stinner9310abb2011-10-05 00:59:23 +020010370 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010371}
10372
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010373/* This function assumes that str1 and str2 are readied by the caller. */
10374
Marc-André Lemburge5034372000-08-08 08:04:29 +000010375static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010376unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010377{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010378 int kind1, kind2;
10379 void *data1, *data2;
10380 Py_ssize_t len1, len2, i;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010381
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010382 kind1 = PyUnicode_KIND(str1);
10383 kind2 = PyUnicode_KIND(str2);
10384 data1 = PyUnicode_DATA(str1);
10385 data2 = PyUnicode_DATA(str2);
10386 len1 = PyUnicode_GET_LENGTH(str1);
10387 len2 = PyUnicode_GET_LENGTH(str2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010388
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010389 for (i = 0; i < len1 && i < len2; ++i) {
10390 Py_UCS4 c1, c2;
10391 c1 = PyUnicode_READ(kind1, data1, i);
10392 c2 = PyUnicode_READ(kind2, data2, i);
Fredrik Lundh45714e92001-06-26 16:39:36 +000010393
10394 if (c1 != c2)
10395 return (c1 < c2) ? -1 : 1;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010396 }
10397
10398 return (len1 < len2) ? -1 : (len1 != len2);
10399}
10400
Alexander Belopolsky40018472011-02-26 01:02:56 +000010401int
10402PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010403{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010404 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10405 if (PyUnicode_READY(left) == -1 ||
10406 PyUnicode_READY(right) == -1)
10407 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010408 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010409 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010410 PyErr_Format(PyExc_TypeError,
10411 "Can't compare %.100s and %.100s",
10412 left->ob_type->tp_name,
10413 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010414 return -1;
10415}
10416
Martin v. Löwis5b222132007-06-10 09:51:05 +000010417int
10418PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10419{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010420 Py_ssize_t i;
10421 int kind;
10422 void *data;
10423 Py_UCS4 chr;
10424
Victor Stinner910337b2011-10-03 03:20:16 +020010425 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010426 if (PyUnicode_READY(uni) == -1)
10427 return -1;
10428 kind = PyUnicode_KIND(uni);
10429 data = PyUnicode_DATA(uni);
Martin v. Löwis5b222132007-06-10 09:51:05 +000010430 /* Compare Unicode string and source character set string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010431 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10432 if (chr != str[i])
10433 return (chr < (unsigned char)(str[i])) ? -1 : 1;
Benjamin Peterson8667a9b2010-01-09 21:45:28 +000010434 /* This check keeps Python strings that end in '\0' from comparing equal
10435 to C strings identical up to that point. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010436 if (PyUnicode_GET_LENGTH(uni) != i || chr)
Benjamin Peterson29060642009-01-31 22:14:21 +000010437 return 1; /* uni is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010438 if (str[i])
Benjamin Peterson29060642009-01-31 22:14:21 +000010439 return -1; /* str is longer */
Martin v. Löwis5b222132007-06-10 09:51:05 +000010440 return 0;
10441}
10442
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010443
Benjamin Peterson29060642009-01-31 22:14:21 +000010444#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010445 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010446
Alexander Belopolsky40018472011-02-26 01:02:56 +000010447PyObject *
10448PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010449{
10450 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010451
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010452 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10453 PyObject *v;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010454 if (PyUnicode_READY(left) == -1 ||
10455 PyUnicode_READY(right) == -1)
10456 return NULL;
10457 if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
10458 PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010459 if (op == Py_EQ) {
10460 Py_INCREF(Py_False);
10461 return Py_False;
10462 }
10463 if (op == Py_NE) {
10464 Py_INCREF(Py_True);
10465 return Py_True;
10466 }
10467 }
10468 if (left == right)
10469 result = 0;
10470 else
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010471 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010472
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010473 /* Convert the return value to a Boolean */
10474 switch (op) {
10475 case Py_EQ:
10476 v = TEST_COND(result == 0);
10477 break;
10478 case Py_NE:
10479 v = TEST_COND(result != 0);
10480 break;
10481 case Py_LE:
10482 v = TEST_COND(result <= 0);
10483 break;
10484 case Py_GE:
10485 v = TEST_COND(result >= 0);
10486 break;
10487 case Py_LT:
10488 v = TEST_COND(result == -1);
10489 break;
10490 case Py_GT:
10491 v = TEST_COND(result == 1);
10492 break;
10493 default:
10494 PyErr_BadArgument();
10495 return NULL;
10496 }
10497 Py_INCREF(v);
10498 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010499 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010500
Brian Curtindfc80e32011-08-10 20:28:54 -050010501 Py_RETURN_NOTIMPLEMENTED;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010502}
10503
Alexander Belopolsky40018472011-02-26 01:02:56 +000010504int
10505PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010506{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010507 PyObject *str, *sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010508 int kind1, kind2, kind;
10509 void *buf1, *buf2;
10510 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010511 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010512
10513 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010514 sub = PyUnicode_FromObject(element);
10515 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010516 PyErr_Format(PyExc_TypeError,
10517 "'in <string>' requires string as left operand, not %s",
10518 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010519 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010520 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010521 if (PyUnicode_READY(sub) == -1)
10522 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010523
Thomas Wouters477c8d52006-05-27 19:21:47 +000010524 str = PyUnicode_FromObject(container);
Victor Stinnere9a29352011-10-01 02:14:59 +020010525 if (!str || PyUnicode_READY(str) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010526 Py_DECREF(sub);
10527 return -1;
10528 }
10529
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010530 kind1 = PyUnicode_KIND(str);
10531 kind2 = PyUnicode_KIND(sub);
10532 kind = kind1 > kind2 ? kind1 : kind2;
10533 buf1 = PyUnicode_DATA(str);
10534 buf2 = PyUnicode_DATA(sub);
10535 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010536 buf1 = _PyUnicode_AsKind(str, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010537 if (!buf1) {
10538 Py_DECREF(sub);
10539 return -1;
10540 }
10541 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010542 buf2 = _PyUnicode_AsKind(sub, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010543 if (!buf2) {
10544 Py_DECREF(sub);
10545 if (kind1 != kind) PyMem_Free(buf1);
10546 return -1;
10547 }
10548 len1 = PyUnicode_GET_LENGTH(str);
10549 len2 = PyUnicode_GET_LENGTH(sub);
10550
10551 switch(kind) {
10552 case PyUnicode_1BYTE_KIND:
10553 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10554 break;
10555 case PyUnicode_2BYTE_KIND:
10556 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10557 break;
10558 case PyUnicode_4BYTE_KIND:
10559 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10560 break;
10561 default:
10562 result = -1;
10563 assert(0);
10564 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010565
10566 Py_DECREF(str);
10567 Py_DECREF(sub);
10568
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010569 if (kind1 != kind)
10570 PyMem_Free(buf1);
10571 if (kind2 != kind)
10572 PyMem_Free(buf2);
10573
Guido van Rossum403d68b2000-03-13 15:55:09 +000010574 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010575}
10576
Guido van Rossumd57fd912000-03-10 22:53:23 +000010577/* Concat to string or Unicode object giving a new Unicode object. */
10578
Alexander Belopolsky40018472011-02-26 01:02:56 +000010579PyObject *
10580PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010581{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010582 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010583 Py_UCS4 maxchar, maxchar2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010584
10585 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010586 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010587 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010588 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010589 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010590 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010591 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010592
10593 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010594 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010595 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010596 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010597 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010598 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010599 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010600 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010601 }
10602
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010603 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020010604 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
10605 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010606
Guido van Rossumd57fd912000-03-10 22:53:23 +000010607 /* Concat the two Unicode strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010608 w = PyUnicode_New(
10609 PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
10610 maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010611 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010612 goto onError;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010613 copy_characters(w, 0, u, 0, PyUnicode_GET_LENGTH(u));
10614 copy_characters(w, PyUnicode_GET_LENGTH(u), v, 0, PyUnicode_GET_LENGTH(v));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010615 Py_DECREF(u);
10616 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010617 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010618 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010619
Benjamin Peterson29060642009-01-31 22:14:21 +000010620 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000010621 Py_XDECREF(u);
10622 Py_XDECREF(v);
10623 return NULL;
10624}
10625
Victor Stinnerb0923652011-10-04 01:17:31 +020010626static void
10627unicode_append_inplace(PyObject **p_left, PyObject *right)
10628{
10629 Py_ssize_t left_len, right_len, new_len;
Victor Stinnerb0923652011-10-04 01:17:31 +020010630
10631 assert(PyUnicode_IS_READY(*p_left));
10632 assert(PyUnicode_IS_READY(right));
10633
10634 left_len = PyUnicode_GET_LENGTH(*p_left);
10635 right_len = PyUnicode_GET_LENGTH(right);
10636 if (left_len > PY_SSIZE_T_MAX - right_len) {
10637 PyErr_SetString(PyExc_OverflowError,
10638 "strings are too large to concat");
10639 goto error;
10640 }
10641 new_len = left_len + right_len;
10642
10643 /* Now we own the last reference to 'left', so we can resize it
10644 * in-place.
10645 */
10646 if (unicode_resize(p_left, new_len) != 0) {
10647 /* XXX if _PyUnicode_Resize() fails, 'left' has been
10648 * deallocated so it cannot be put back into
10649 * 'variable'. The MemoryError is raised when there
10650 * is no value in 'variable', which might (very
10651 * remotely) be a cause of incompatibilities.
10652 */
10653 goto error;
10654 }
10655 /* copy 'right' into the newly allocated area of 'left' */
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010656 copy_characters(*p_left, left_len, right, 0, right_len);
10657 _PyUnicode_DIRTY(*p_left);
Victor Stinnerb0923652011-10-04 01:17:31 +020010658 return;
10659
10660error:
10661 Py_DECREF(*p_left);
10662 *p_left = NULL;
10663}
10664
Walter Dörwald1ab83302007-05-18 17:15:44 +000010665void
Victor Stinner23e56682011-10-03 03:54:37 +020010666PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000010667{
Victor Stinner23e56682011-10-03 03:54:37 +020010668 PyObject *left, *res;
10669
10670 if (p_left == NULL) {
10671 if (!PyErr_Occurred())
10672 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000010673 return;
10674 }
Victor Stinner23e56682011-10-03 03:54:37 +020010675 left = *p_left;
10676 if (right == NULL || !PyUnicode_Check(left)) {
10677 if (!PyErr_Occurred())
10678 PyErr_BadInternalCall();
10679 goto error;
10680 }
10681
Victor Stinnere1335c72011-10-04 20:53:03 +020010682 if (PyUnicode_READY(left))
10683 goto error;
10684 if (PyUnicode_READY(right))
10685 goto error;
10686
Victor Stinner23e56682011-10-03 03:54:37 +020010687 if (PyUnicode_CheckExact(left) && left != unicode_empty
10688 && PyUnicode_CheckExact(right) && right != unicode_empty
10689 && unicode_resizable(left)
10690 && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
10691 || _PyUnicode_WSTR(left) != NULL))
10692 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010693 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10694 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020010695 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020010696 not so different than duplicating the string. */
10697 if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
Victor Stinner23e56682011-10-03 03:54:37 +020010698 {
Victor Stinnerb0923652011-10-04 01:17:31 +020010699 unicode_append_inplace(p_left, right);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010010700 assert(p_left == NULL || _PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020010701 return;
10702 }
10703 }
10704
10705 res = PyUnicode_Concat(left, right);
10706 if (res == NULL)
10707 goto error;
10708 Py_DECREF(left);
10709 *p_left = res;
10710 return;
10711
10712error:
10713 Py_DECREF(*p_left);
10714 *p_left = NULL;
Walter Dörwald1ab83302007-05-18 17:15:44 +000010715}
10716
10717void
10718PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10719{
Benjamin Peterson14339b62009-01-31 16:36:08 +000010720 PyUnicode_Append(pleft, right);
10721 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000010722}
10723
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010724PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010725 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010726\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000010727Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010728string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010729interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010730
10731static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010732unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010733{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010734 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010735 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010736 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010737 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010738 int kind1, kind2, kind;
10739 void *buf1, *buf2;
10740 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010741
Jesus Ceaac451502011-04-20 17:09:23 +020010742 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10743 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000010744 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000010745
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010746 kind1 = PyUnicode_KIND(self);
10747 kind2 = PyUnicode_KIND(substring);
10748 kind = kind1 > kind2 ? kind1 : kind2;
10749 buf1 = PyUnicode_DATA(self);
10750 buf2 = PyUnicode_DATA(substring);
10751 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010752 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010753 if (!buf1) {
10754 Py_DECREF(substring);
10755 return NULL;
10756 }
10757 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010758 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010759 if (!buf2) {
10760 Py_DECREF(substring);
10761 if (kind1 != kind) PyMem_Free(buf1);
10762 return NULL;
10763 }
10764 len1 = PyUnicode_GET_LENGTH(self);
10765 len2 = PyUnicode_GET_LENGTH(substring);
10766
10767 ADJUST_INDICES(start, end, len1);
10768 switch(kind) {
10769 case PyUnicode_1BYTE_KIND:
10770 iresult = ucs1lib_count(
10771 ((Py_UCS1*)buf1) + start, end - start,
10772 buf2, len2, PY_SSIZE_T_MAX
10773 );
10774 break;
10775 case PyUnicode_2BYTE_KIND:
10776 iresult = ucs2lib_count(
10777 ((Py_UCS2*)buf1) + start, end - start,
10778 buf2, len2, PY_SSIZE_T_MAX
10779 );
10780 break;
10781 case PyUnicode_4BYTE_KIND:
10782 iresult = ucs4lib_count(
10783 ((Py_UCS4*)buf1) + start, end - start,
10784 buf2, len2, PY_SSIZE_T_MAX
10785 );
10786 break;
10787 default:
10788 assert(0); iresult = 0;
10789 }
10790
10791 result = PyLong_FromSsize_t(iresult);
10792
10793 if (kind1 != kind)
10794 PyMem_Free(buf1);
10795 if (kind2 != kind)
10796 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010797
10798 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010799
Guido van Rossumd57fd912000-03-10 22:53:23 +000010800 return result;
10801}
10802
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010803PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000010804 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010805\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000010806Encode S using the codec registered for encoding. Default encoding\n\
10807is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000010808handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000010809a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10810'xmlcharrefreplace' as well as any other name registered with\n\
10811codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010812
10813static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010814unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010815{
Benjamin Peterson308d6372009-09-18 21:42:35 +000010816 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000010817 char *encoding = NULL;
10818 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000010819
Benjamin Peterson308d6372009-09-18 21:42:35 +000010820 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10821 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010822 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010823 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000010824}
10825
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010826PyDoc_STRVAR(expandtabs__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010827 "S.expandtabs([tabsize]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010828\n\
10829Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010830If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010831
10832static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010833unicode_expandtabs(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010834{
Antoine Pitroue71d5742011-10-04 15:55:09 +020010835 Py_ssize_t i, j, line_pos, src_len, incr;
10836 Py_UCS4 ch;
10837 PyObject *u;
10838 void *src_data, *dest_data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010839 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010840 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010841 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010842
10843 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000010844 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010845
Antoine Pitrou22425222011-10-04 19:10:51 +020010846 if (PyUnicode_READY(self) == -1)
10847 return NULL;
10848
Thomas Wouters7e474022000-07-16 12:04:32 +000010849 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010850 src_len = PyUnicode_GET_LENGTH(self);
10851 i = j = line_pos = 0;
10852 kind = PyUnicode_KIND(self);
10853 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020010854 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010855 for (; i < src_len; i++) {
10856 ch = PyUnicode_READ(kind, src_data, i);
10857 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020010858 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000010859 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010860 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000010861 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010862 goto overflow;
10863 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000010864 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010865 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010866 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010867 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000010868 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020010869 goto overflow;
10870 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010871 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010872 if (ch == '\n' || ch == '\r')
10873 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010874 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010875 }
Antoine Pitroue19aa382011-10-04 16:04:01 +020010876 if (!found && PyUnicode_CheckExact(self)) {
Victor Stinner7931d9a2011-11-04 00:22:48 +010010877 Py_INCREF(self);
10878 return self;
Antoine Pitroue19aa382011-10-04 16:04:01 +020010879 }
Guido van Rossumcd16bf62007-06-13 18:07:49 +000010880
Guido van Rossumd57fd912000-03-10 22:53:23 +000010881 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020010882 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000010883 if (!u)
10884 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010885 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010886
Antoine Pitroue71d5742011-10-04 15:55:09 +020010887 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010888
Antoine Pitroue71d5742011-10-04 15:55:09 +020010889 for (; i < src_len; i++) {
10890 ch = PyUnicode_READ(kind, src_data, i);
10891 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000010892 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010893 incr = tabsize - (line_pos % tabsize);
10894 line_pos += incr;
10895 while (incr--) {
10896 PyUnicode_WRITE(kind, dest_data, j, ' ');
10897 j++;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010898 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010899 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000010900 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010901 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020010902 line_pos++;
10903 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010904 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020010905 if (ch == '\n' || ch == '\r')
10906 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010907 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020010908 }
10909 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010010910 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010911
Antoine Pitroue71d5742011-10-04 15:55:09 +020010912 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000010913 PyErr_SetString(PyExc_OverflowError, "new string is too long");
10914 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010915}
10916
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010917PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010918 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010919\n\
10920Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080010921such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010922arguments start and end are interpreted as in slice notation.\n\
10923\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010924Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010925
10926static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010927unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010928{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010929 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000010930 Py_ssize_t start;
10931 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010932 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010933
Jesus Ceaac451502011-04-20 17:09:23 +020010934 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10935 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000010936 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010937
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010938 if (PyUnicode_READY(self) == -1)
10939 return NULL;
10940 if (PyUnicode_READY(substring) == -1)
10941 return NULL;
10942
Victor Stinner7931d9a2011-11-04 00:22:48 +010010943 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010944
10945 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010946
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010947 if (result == -2)
10948 return NULL;
10949
Christian Heimes217cfd12007-12-02 14:31:20 +000010950 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010951}
10952
10953static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010954unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010955{
Victor Stinner2fe5ced2011-10-02 00:25:40 +020010956 Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10957 if (ch == (Py_UCS4)-1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010958 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010959 return PyUnicode_FromOrdinal(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010960}
10961
Guido van Rossumc2504932007-09-18 19:42:40 +000010962/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010010963 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000010964static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010965unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010966{
Guido van Rossumc2504932007-09-18 19:42:40 +000010967 Py_ssize_t len;
Mark Dickinson57e683e2011-09-24 18:18:40 +010010968 Py_uhash_t x;
Guido van Rossumc2504932007-09-18 19:42:40 +000010969
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010970 if (_PyUnicode_HASH(self) != -1)
10971 return _PyUnicode_HASH(self);
10972 if (PyUnicode_READY(self) == -1)
10973 return -1;
10974 len = PyUnicode_GET_LENGTH(self);
10975
10976 /* The hash function as a macro, gets expanded three times below. */
10977#define HASH(P) \
10978 x = (Py_uhash_t)*P << 7; \
10979 while (--len >= 0) \
10980 x = (1000003*x) ^ (Py_uhash_t)*P++;
10981
10982 switch (PyUnicode_KIND(self)) {
10983 case PyUnicode_1BYTE_KIND: {
10984 const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10985 HASH(c);
10986 break;
10987 }
10988 case PyUnicode_2BYTE_KIND: {
10989 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10990 HASH(s);
10991 break;
10992 }
10993 default: {
10994 Py_UCS4 *l;
10995 assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10996 "Impossible switch case in unicode_hash");
10997 l = PyUnicode_4BYTE_DATA(self);
10998 HASH(l);
10999 break;
11000 }
11001 }
11002 x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
11003
Guido van Rossumc2504932007-09-18 19:42:40 +000011004 if (x == -1)
11005 x = -2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011006 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011007 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011008}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011009#undef HASH
Guido van Rossumd57fd912000-03-10 22:53:23 +000011010
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011011PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011012 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011013\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011014Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011015
11016static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011017unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011018{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011019 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011020 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011021 Py_ssize_t start;
11022 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011023
Jesus Ceaac451502011-04-20 17:09:23 +020011024 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11025 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011026 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011027
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011028 if (PyUnicode_READY(self) == -1)
11029 return NULL;
11030 if (PyUnicode_READY(substring) == -1)
11031 return NULL;
11032
Victor Stinner7931d9a2011-11-04 00:22:48 +010011033 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011034
11035 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011036
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011037 if (result == -2)
11038 return NULL;
11039
Guido van Rossumd57fd912000-03-10 22:53:23 +000011040 if (result < 0) {
11041 PyErr_SetString(PyExc_ValueError, "substring not found");
11042 return NULL;
11043 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011044
Christian Heimes217cfd12007-12-02 14:31:20 +000011045 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011046}
11047
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011048PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011049 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011050\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011051Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011052at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011053
11054static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011055unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011056{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011057 Py_ssize_t i, length;
11058 int kind;
11059 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011060 int cased;
11061
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011062 if (PyUnicode_READY(self) == -1)
11063 return NULL;
11064 length = PyUnicode_GET_LENGTH(self);
11065 kind = PyUnicode_KIND(self);
11066 data = PyUnicode_DATA(self);
11067
Guido van Rossumd57fd912000-03-10 22:53:23 +000011068 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011069 if (length == 1)
11070 return PyBool_FromLong(
11071 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011072
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011073 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011074 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011075 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011076
Guido van Rossumd57fd912000-03-10 22:53:23 +000011077 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011078 for (i = 0; i < length; i++) {
11079 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011080
Benjamin Peterson29060642009-01-31 22:14:21 +000011081 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11082 return PyBool_FromLong(0);
11083 else if (!cased && Py_UNICODE_ISLOWER(ch))
11084 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011085 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011086 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011087}
11088
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011089PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011090 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011091\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011092Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011093at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011094
11095static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011096unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011097{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011098 Py_ssize_t i, length;
11099 int kind;
11100 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011101 int cased;
11102
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011103 if (PyUnicode_READY(self) == -1)
11104 return NULL;
11105 length = PyUnicode_GET_LENGTH(self);
11106 kind = PyUnicode_KIND(self);
11107 data = PyUnicode_DATA(self);
11108
Guido van Rossumd57fd912000-03-10 22:53:23 +000011109 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011110 if (length == 1)
11111 return PyBool_FromLong(
11112 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011113
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011114 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011115 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011116 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011117
Guido van Rossumd57fd912000-03-10 22:53:23 +000011118 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011119 for (i = 0; i < length; i++) {
11120 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011121
Benjamin Peterson29060642009-01-31 22:14:21 +000011122 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11123 return PyBool_FromLong(0);
11124 else if (!cased && Py_UNICODE_ISUPPER(ch))
11125 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011126 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011127 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011128}
11129
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011130PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011131 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011132\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011133Return True if S is a titlecased string and there is at least one\n\
11134character in S, i.e. upper- and titlecase characters may only\n\
11135follow uncased characters and lowercase characters only cased ones.\n\
11136Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011137
11138static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011139unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011140{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011141 Py_ssize_t i, length;
11142 int kind;
11143 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011144 int cased, previous_is_cased;
11145
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011146 if (PyUnicode_READY(self) == -1)
11147 return NULL;
11148 length = PyUnicode_GET_LENGTH(self);
11149 kind = PyUnicode_KIND(self);
11150 data = PyUnicode_DATA(self);
11151
Guido van Rossumd57fd912000-03-10 22:53:23 +000011152 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011153 if (length == 1) {
11154 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11155 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11156 (Py_UNICODE_ISUPPER(ch) != 0));
11157 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011158
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011159 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011160 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011161 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011162
Guido van Rossumd57fd912000-03-10 22:53:23 +000011163 cased = 0;
11164 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011165 for (i = 0; i < length; i++) {
11166 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011167
Benjamin Peterson29060642009-01-31 22:14:21 +000011168 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11169 if (previous_is_cased)
11170 return PyBool_FromLong(0);
11171 previous_is_cased = 1;
11172 cased = 1;
11173 }
11174 else if (Py_UNICODE_ISLOWER(ch)) {
11175 if (!previous_is_cased)
11176 return PyBool_FromLong(0);
11177 previous_is_cased = 1;
11178 cased = 1;
11179 }
11180 else
11181 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011182 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011183 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011184}
11185
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011186PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011187 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011188\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011189Return True if all characters in S are whitespace\n\
11190and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011191
11192static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011193unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011194{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011195 Py_ssize_t i, length;
11196 int kind;
11197 void *data;
11198
11199 if (PyUnicode_READY(self) == -1)
11200 return NULL;
11201 length = PyUnicode_GET_LENGTH(self);
11202 kind = PyUnicode_KIND(self);
11203 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011204
Guido van Rossumd57fd912000-03-10 22:53:23 +000011205 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011206 if (length == 1)
11207 return PyBool_FromLong(
11208 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011209
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011210 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011211 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011212 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011213
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011214 for (i = 0; i < length; i++) {
11215 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011216 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011217 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011218 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011219 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011220}
11221
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011222PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011223 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011224\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011225Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011226and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011227
11228static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011229unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011230{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011231 Py_ssize_t i, length;
11232 int kind;
11233 void *data;
11234
11235 if (PyUnicode_READY(self) == -1)
11236 return NULL;
11237 length = PyUnicode_GET_LENGTH(self);
11238 kind = PyUnicode_KIND(self);
11239 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011240
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011241 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011242 if (length == 1)
11243 return PyBool_FromLong(
11244 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011245
11246 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011247 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011248 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011249
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011250 for (i = 0; i < length; i++) {
11251 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011252 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011253 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011254 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011255}
11256
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011257PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011258 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011259\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011260Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011261and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011262
11263static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011264unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011265{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011266 int kind;
11267 void *data;
11268 Py_ssize_t len, i;
11269
11270 if (PyUnicode_READY(self) == -1)
11271 return NULL;
11272
11273 kind = PyUnicode_KIND(self);
11274 data = PyUnicode_DATA(self);
11275 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011276
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011277 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011278 if (len == 1) {
11279 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11280 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11281 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011282
11283 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011284 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011285 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011286
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011287 for (i = 0; i < len; i++) {
11288 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011289 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011290 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011291 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011292 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011293}
11294
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011295PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011296 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011297\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011298Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011299False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011300
11301static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011302unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011303{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011304 Py_ssize_t i, length;
11305 int kind;
11306 void *data;
11307
11308 if (PyUnicode_READY(self) == -1)
11309 return NULL;
11310 length = PyUnicode_GET_LENGTH(self);
11311 kind = PyUnicode_KIND(self);
11312 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011313
Guido van Rossumd57fd912000-03-10 22:53:23 +000011314 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011315 if (length == 1)
11316 return PyBool_FromLong(
11317 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011318
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011319 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011320 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011321 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011322
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011323 for (i = 0; i < length; i++) {
11324 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011325 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011326 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011327 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011328}
11329
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011330PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011331 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011332\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011333Return True if all characters in S are digits\n\
11334and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011335
11336static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011337unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011338{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011339 Py_ssize_t i, length;
11340 int kind;
11341 void *data;
11342
11343 if (PyUnicode_READY(self) == -1)
11344 return NULL;
11345 length = PyUnicode_GET_LENGTH(self);
11346 kind = PyUnicode_KIND(self);
11347 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011348
Guido van Rossumd57fd912000-03-10 22:53:23 +000011349 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011350 if (length == 1) {
11351 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11352 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11353 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011354
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011355 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011356 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011357 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011358
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011359 for (i = 0; i < length; i++) {
11360 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011361 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011362 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011363 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011364}
11365
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011366PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011367 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011368\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011369Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011370False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011371
11372static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011373unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011374{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011375 Py_ssize_t i, length;
11376 int kind;
11377 void *data;
11378
11379 if (PyUnicode_READY(self) == -1)
11380 return NULL;
11381 length = PyUnicode_GET_LENGTH(self);
11382 kind = PyUnicode_KIND(self);
11383 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011384
Guido van Rossumd57fd912000-03-10 22:53:23 +000011385 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011386 if (length == 1)
11387 return PyBool_FromLong(
11388 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011389
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011390 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011391 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011392 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011393
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011394 for (i = 0; i < length; i++) {
11395 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011396 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011397 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011398 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011399}
11400
Martin v. Löwis47383402007-08-15 07:32:56 +000011401int
11402PyUnicode_IsIdentifier(PyObject *self)
11403{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011404 int kind;
11405 void *data;
11406 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011407 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011408
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011409 if (PyUnicode_READY(self) == -1) {
11410 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011411 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011412 }
11413
11414 /* Special case for empty strings */
11415 if (PyUnicode_GET_LENGTH(self) == 0)
11416 return 0;
11417 kind = PyUnicode_KIND(self);
11418 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011419
11420 /* PEP 3131 says that the first character must be in
11421 XID_Start and subsequent characters in XID_Continue,
11422 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011423 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011424 letters, digits, underscore). However, given the current
11425 definition of XID_Start and XID_Continue, it is sufficient
11426 to check just for these, except that _ must be allowed
11427 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011428 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011429 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011430 return 0;
11431
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011432 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011433 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011434 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011435 return 1;
11436}
11437
11438PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011439 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011440\n\
11441Return True if S is a valid identifier according\n\
11442to the language definition.");
11443
11444static PyObject*
11445unicode_isidentifier(PyObject *self)
11446{
11447 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11448}
11449
Georg Brandl559e5d72008-06-11 18:37:52 +000011450PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011451 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011452\n\
11453Return True if all characters in S are considered\n\
11454printable in repr() or S is empty, False otherwise.");
11455
11456static PyObject*
11457unicode_isprintable(PyObject *self)
11458{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011459 Py_ssize_t i, length;
11460 int kind;
11461 void *data;
11462
11463 if (PyUnicode_READY(self) == -1)
11464 return NULL;
11465 length = PyUnicode_GET_LENGTH(self);
11466 kind = PyUnicode_KIND(self);
11467 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011468
11469 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011470 if (length == 1)
11471 return PyBool_FromLong(
11472 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011473
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011474 for (i = 0; i < length; i++) {
11475 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011476 Py_RETURN_FALSE;
11477 }
11478 }
11479 Py_RETURN_TRUE;
11480}
11481
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011482PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011483 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011484\n\
11485Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011486iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011487
11488static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011489unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011490{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011491 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011492}
11493
Martin v. Löwis18e16552006-02-15 17:27:45 +000011494static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011495unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011496{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011497 if (PyUnicode_READY(self) == -1)
11498 return -1;
11499 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011500}
11501
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011502PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011503 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011504\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011505Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011506done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011507
11508static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011509unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011510{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011511 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011512 Py_UCS4 fillchar = ' ';
11513
11514 if (PyUnicode_READY(self) == -1)
11515 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011516
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011517 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011518 return NULL;
11519
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011520 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011521 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010011522 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011523 }
11524
Victor Stinner7931d9a2011-11-04 00:22:48 +010011525 return pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011526}
11527
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011528PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011529 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011530\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011531Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011532
11533static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011534unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011535{
Guido van Rossumd57fd912000-03-10 22:53:23 +000011536 return fixup(self, fixlower);
11537}
11538
/* Selectors for the strip family: which end(s) of the string to trim */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by the selectors above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for selector i: the format string past its 3-character
   "|O:" prefix */
#define STRIPNAME(i) (stripformat[i]+3)
11547
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011548/* externally visible for str.strip(unicode) */
11549PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011550_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011551{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011552 void *data;
11553 int kind;
11554 Py_ssize_t i, j, len;
11555 BLOOM_MASK sepmask;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011556
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011557 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11558 return NULL;
11559
11560 kind = PyUnicode_KIND(self);
11561 data = PyUnicode_DATA(self);
11562 len = PyUnicode_GET_LENGTH(self);
11563 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11564 PyUnicode_DATA(sepobj),
11565 PyUnicode_GET_LENGTH(sepobj));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011566
Benjamin Peterson14339b62009-01-31 16:36:08 +000011567 i = 0;
11568 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011569 while (i < len &&
11570 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011571 i++;
11572 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011573 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011574
Benjamin Peterson14339b62009-01-31 16:36:08 +000011575 j = len;
11576 if (striptype != LEFTSTRIP) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011577 do {
11578 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011579 } while (j >= i &&
11580 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
Benjamin Peterson29060642009-01-31 22:14:21 +000011581 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011582 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011583
Victor Stinner7931d9a2011-11-04 00:22:48 +010011584 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011585}
11586
11587PyObject*
11588PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11589{
11590 unsigned char *data;
11591 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020011592 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011593
Victor Stinnerde636f32011-10-01 03:55:54 +020011594 if (PyUnicode_READY(self) == -1)
11595 return NULL;
11596
11597 end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11598
Victor Stinner12bab6d2011-10-01 01:53:49 +020011599 if (start == 0 && end == PyUnicode_GET_LENGTH(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011600 {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011601 if (PyUnicode_CheckExact(self)) {
11602 Py_INCREF(self);
11603 return self;
11604 }
11605 else
11606 return PyUnicode_Copy(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011607 }
11608
Victor Stinner12bab6d2011-10-01 01:53:49 +020011609 length = end - start;
11610 if (length == 1)
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011611 return unicode_getitem(self, start);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011612
Victor Stinnerde636f32011-10-01 03:55:54 +020011613 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020011614 PyErr_SetString(PyExc_IndexError, "string index out of range");
11615 return NULL;
11616 }
11617
Victor Stinnerb9275c12011-10-05 14:01:42 +020011618 if (PyUnicode_IS_ASCII(self)) {
11619 kind = PyUnicode_KIND(self);
11620 data = PyUnicode_1BYTE_DATA(self);
11621 return unicode_fromascii(data + start, length);
11622 }
11623 else {
11624 kind = PyUnicode_KIND(self);
11625 data = PyUnicode_1BYTE_DATA(self);
11626 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011627 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020011628 length);
11629 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011630}
Guido van Rossumd57fd912000-03-10 22:53:23 +000011631
11632static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011633do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011634{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011635 int kind;
11636 void *data;
11637 Py_ssize_t len, i, j;
11638
11639 if (PyUnicode_READY(self) == -1)
11640 return NULL;
11641
11642 kind = PyUnicode_KIND(self);
11643 data = PyUnicode_DATA(self);
11644 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011645
Benjamin Peterson14339b62009-01-31 16:36:08 +000011646 i = 0;
11647 if (striptype != RIGHTSTRIP) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011648 while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011649 i++;
11650 }
11651 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011652
Benjamin Peterson14339b62009-01-31 16:36:08 +000011653 j = len;
11654 if (striptype != LEFTSTRIP) {
11655 do {
11656 j--;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011657 } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011658 j++;
11659 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011660
Victor Stinner7931d9a2011-11-04 00:22:48 +010011661 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011662}
11663
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011664
11665static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011666do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011667{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011668 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011669
Benjamin Peterson14339b62009-01-31 16:36:08 +000011670 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11671 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011672
Benjamin Peterson14339b62009-01-31 16:36:08 +000011673 if (sep != NULL && sep != Py_None) {
11674 if (PyUnicode_Check(sep))
11675 return _PyUnicode_XStrip(self, striptype, sep);
11676 else {
11677 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011678 "%s arg must be None or str",
11679 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000011680 return NULL;
11681 }
11682 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011683
Benjamin Peterson14339b62009-01-31 16:36:08 +000011684 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011685}
11686
11687
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011688PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011689 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011690\n\
11691Return a copy of the string S with leading and trailing\n\
11692whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011693If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011694
11695static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011696unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011697{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011698 if (PyTuple_GET_SIZE(args) == 0)
11699 return do_strip(self, BOTHSTRIP); /* Common case */
11700 else
11701 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011702}
11703
11704
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011705PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011706 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011707\n\
11708Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011709If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011710
11711static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011712unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011713{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011714 if (PyTuple_GET_SIZE(args) == 0)
11715 return do_strip(self, LEFTSTRIP); /* Common case */
11716 else
11717 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011718}
11719
11720
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011721PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011722 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011723\n\
11724Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011725If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011726
11727static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011728unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011729{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011730 if (PyTuple_GET_SIZE(args) == 0)
11731 return do_strip(self, RIGHTSTRIP); /* Common case */
11732 else
11733 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011734}
11735
11736
Guido van Rossumd57fd912000-03-10 22:53:23 +000011737static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011738unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011739{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011740 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011741 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011742
Georg Brandl222de0f2009-04-12 12:01:50 +000011743 if (len < 1) {
11744 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +020011745 return unicode_empty;
Georg Brandl222de0f2009-04-12 12:01:50 +000011746 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011747
Tim Peters7a29bd52001-09-12 03:03:31 +000011748 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000011749 /* no repeat, return original string */
11750 Py_INCREF(str);
Victor Stinner7931d9a2011-11-04 00:22:48 +010011751 return str;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011752 }
Tim Peters8f422462000-09-09 06:13:41 +000011753
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011754 if (PyUnicode_READY(str) == -1)
11755 return NULL;
11756
Victor Stinnerc759f3e2011-10-01 03:09:58 +020011757 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020011758 PyErr_SetString(PyExc_OverflowError,
11759 "repeated string is too long");
11760 return NULL;
11761 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011762 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011763
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011764 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011765 if (!u)
11766 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020011767 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011768
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011769 if (PyUnicode_GET_LENGTH(str) == 1) {
11770 const int kind = PyUnicode_KIND(str);
11771 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11772 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020011773 if (kind == PyUnicode_1BYTE_KIND)
11774 memset(to, (unsigned char)fill_char, len);
11775 else {
11776 for (n = 0; n < len; ++n)
11777 PyUnicode_WRITE(kind, to, n, fill_char);
11778 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011779 }
11780 else {
11781 /* number of characters copied this far */
11782 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011783 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011784 char *to = (char *) PyUnicode_DATA(u);
11785 Py_MEMCPY(to, PyUnicode_DATA(str),
11786 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000011787 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011788 n = (done <= nchars-done) ? done : nchars-done;
11789 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011790 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000011791 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011792 }
11793
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011794 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011795 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011796}
11797
Alexander Belopolsky40018472011-02-26 01:02:56 +000011798PyObject *
11799PyUnicode_Replace(PyObject *obj,
11800 PyObject *subobj,
11801 PyObject *replobj,
11802 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011803{
11804 PyObject *self;
11805 PyObject *str1;
11806 PyObject *str2;
11807 PyObject *result;
11808
11809 self = PyUnicode_FromObject(obj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011810 if (self == NULL || PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011811 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011812 str1 = PyUnicode_FromObject(subobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011813 if (str1 == NULL || PyUnicode_READY(str1) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011814 Py_DECREF(self);
11815 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011816 }
11817 str2 = PyUnicode_FromObject(replobj);
Victor Stinnere9a29352011-10-01 02:14:59 +020011818 if (str2 == NULL || PyUnicode_READY(str2)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011819 Py_DECREF(self);
11820 Py_DECREF(str1);
11821 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011822 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011823 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011824 Py_DECREF(self);
11825 Py_DECREF(str1);
11826 Py_DECREF(str2);
11827 return result;
11828}
11829
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011830PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000011831 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011832\n\
11833Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000011834old replaced by new. If the optional argument count is\n\
11835given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011836
11837static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011838unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011839{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011840 PyObject *str1;
11841 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011842 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011843 PyObject *result;
11844
Martin v. Löwis18e16552006-02-15 17:27:45 +000011845 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011846 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011847 if (!PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000011848 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011849 str1 = PyUnicode_FromObject(str1);
11850 if (str1 == NULL || PyUnicode_READY(str1) == -1)
11851 return NULL;
11852 str2 = PyUnicode_FromObject(str2);
Victor Stinnere9a29352011-10-01 02:14:59 +020011853 if (str2 == NULL || PyUnicode_READY(str2) == -1) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011854 Py_DECREF(str1);
11855 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000011856 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011857
11858 result = replace(self, str1, str2, maxcount);
11859
11860 Py_DECREF(str1);
11861 Py_DECREF(str2);
11862 return result;
11863}
11864
Alexander Belopolsky40018472011-02-26 01:02:56 +000011865static PyObject *
11866unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011867{
Walter Dörwald79e913e2007-05-12 11:08:06 +000011868 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011869 Py_ssize_t isize;
11870 Py_ssize_t osize, squote, dquote, i, o;
11871 Py_UCS4 max, quote;
11872 int ikind, okind;
11873 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000011874
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011875 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000011876 return NULL;
11877
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011878 isize = PyUnicode_GET_LENGTH(unicode);
11879 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011880
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011881 /* Compute length of output, quote characters, and
11882 maximum character */
11883 osize = 2; /* quotes */
11884 max = 127;
11885 squote = dquote = 0;
11886 ikind = PyUnicode_KIND(unicode);
11887 for (i = 0; i < isize; i++) {
11888 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11889 switch (ch) {
11890 case '\'': squote++; osize++; break;
11891 case '"': dquote++; osize++; break;
11892 case '\\': case '\t': case '\r': case '\n':
11893 osize += 2; break;
11894 default:
11895 /* Fast-path ASCII */
11896 if (ch < ' ' || ch == 0x7f)
11897 osize += 4; /* \xHH */
11898 else if (ch < 0x7f)
11899 osize++;
11900 else if (Py_UNICODE_ISPRINTABLE(ch)) {
11901 osize++;
11902 max = ch > max ? ch : max;
11903 }
11904 else if (ch < 0x100)
11905 osize += 4; /* \xHH */
11906 else if (ch < 0x10000)
11907 osize += 6; /* \uHHHH */
11908 else
11909 osize += 10; /* \uHHHHHHHH */
11910 }
11911 }
11912
11913 quote = '\'';
11914 if (squote) {
11915 if (dquote)
11916 /* Both squote and dquote present. Use squote,
11917 and escape them */
11918 osize += squote;
11919 else
11920 quote = '"';
11921 }
11922
11923 repr = PyUnicode_New(osize, max);
11924 if (repr == NULL)
11925 return NULL;
11926 okind = PyUnicode_KIND(repr);
11927 odata = PyUnicode_DATA(repr);
11928
11929 PyUnicode_WRITE(okind, odata, 0, quote);
11930 PyUnicode_WRITE(okind, odata, osize-1, quote);
11931
11932 for (i = 0, o = 1; i < isize; i++) {
11933 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011934
11935 /* Escape quotes and backslashes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011936 if ((ch == quote) || (ch == '\\')) {
11937 PyUnicode_WRITE(okind, odata, o++, '\\');
11938 PyUnicode_WRITE(okind, odata, o++, ch);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011939 continue;
11940 }
11941
Benjamin Peterson29060642009-01-31 22:14:21 +000011942 /* Map special whitespace to '\t', \n', '\r' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011943 if (ch == '\t') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011944 PyUnicode_WRITE(okind, odata, o++, '\\');
11945 PyUnicode_WRITE(okind, odata, o++, 't');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011946 }
11947 else if (ch == '\n') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011948 PyUnicode_WRITE(okind, odata, o++, '\\');
11949 PyUnicode_WRITE(okind, odata, o++, 'n');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011950 }
11951 else if (ch == '\r') {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011952 PyUnicode_WRITE(okind, odata, o++, '\\');
11953 PyUnicode_WRITE(okind, odata, o++, 'r');
Walter Dörwald79e913e2007-05-12 11:08:06 +000011954 }
11955
11956 /* Map non-printable US ASCII to '\xhh' */
Georg Brandl559e5d72008-06-11 18:37:52 +000011957 else if (ch < ' ' || ch == 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011958 PyUnicode_WRITE(okind, odata, o++, '\\');
11959 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011960 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
11961 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Walter Dörwald79e913e2007-05-12 11:08:06 +000011962 }
11963
Georg Brandl559e5d72008-06-11 18:37:52 +000011964 /* Copy ASCII characters as-is */
11965 else if (ch < 0x7F) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011966 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000011967 }
11968
Benjamin Peterson29060642009-01-31 22:14:21 +000011969 /* Non-ASCII characters */
Georg Brandl559e5d72008-06-11 18:37:52 +000011970 else {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011971 /* Map Unicode whitespace and control characters
Georg Brandl559e5d72008-06-11 18:37:52 +000011972 (categories Z* and C* except ASCII space)
11973 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011974 if (!Py_UNICODE_ISPRINTABLE(ch)) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011975 /* Map 8-bit characters to '\xhh' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011976 if (ch <= 0xff) {
11977 PyUnicode_WRITE(okind, odata, o++, '\\');
11978 PyUnicode_WRITE(okind, odata, o++, 'x');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011979 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
11980 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011981 }
11982 /* Map 21-bit characters to '\U00xxxxxx' */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011983 else if (ch >= 0x10000) {
11984 PyUnicode_WRITE(okind, odata, o++, '\\');
11985 PyUnicode_WRITE(okind, odata, o++, 'U');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011986 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
11987 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
11988 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
11989 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
11990 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
11991 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
11992 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
11993 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000011994 }
11995 /* Map 16-bit characters to '\uxxxx' */
11996 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011997 PyUnicode_WRITE(okind, odata, o++, '\\');
11998 PyUnicode_WRITE(okind, odata, o++, 'u');
Victor Stinnerf5cff562011-10-14 02:13:11 +020011999 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12000 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12001 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12002 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
Georg Brandl559e5d72008-06-11 18:37:52 +000012003 }
12004 }
12005 /* Copy characters as-is */
12006 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012007 PyUnicode_WRITE(okind, odata, o++, ch);
Georg Brandl559e5d72008-06-11 18:37:52 +000012008 }
12009 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012010 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012011 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012012 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012013 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012014}
12015
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012016PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012017 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012018\n\
12019Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012020such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012021arguments start and end are interpreted as in slice notation.\n\
12022\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012023Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012024
12025static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012026unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012027{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012028 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012029 Py_ssize_t start;
12030 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012031 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012032
Jesus Ceaac451502011-04-20 17:09:23 +020012033 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12034 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012035 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012036
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012037 if (PyUnicode_READY(self) == -1)
12038 return NULL;
12039 if (PyUnicode_READY(substring) == -1)
12040 return NULL;
12041
Victor Stinner7931d9a2011-11-04 00:22:48 +010012042 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012043
12044 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012045
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012046 if (result == -2)
12047 return NULL;
12048
Christian Heimes217cfd12007-12-02 14:31:20 +000012049 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012050}
12051
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012052PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012053 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012054\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012055Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012056
12057static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012058unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012059{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012060 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012061 Py_ssize_t start;
12062 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012063 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012064
Jesus Ceaac451502011-04-20 17:09:23 +020012065 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12066 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012067 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012068
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012069 if (PyUnicode_READY(self) == -1)
12070 return NULL;
12071 if (PyUnicode_READY(substring) == -1)
12072 return NULL;
12073
Victor Stinner7931d9a2011-11-04 00:22:48 +010012074 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012075
12076 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012077
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012078 if (result == -2)
12079 return NULL;
12080
Guido van Rossumd57fd912000-03-10 22:53:23 +000012081 if (result < 0) {
12082 PyErr_SetString(PyExc_ValueError, "substring not found");
12083 return NULL;
12084 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012085
Christian Heimes217cfd12007-12-02 14:31:20 +000012086 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012087}
12088
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012089PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012090 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012091\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012092Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012093done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012094
12095static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012096unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012097{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012098 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012099 Py_UCS4 fillchar = ' ';
12100
Victor Stinnere9a29352011-10-01 02:14:59 +020012101 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012102 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012103
Victor Stinnere9a29352011-10-01 02:14:59 +020012104 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012105 return NULL;
12106
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012107 if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012108 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010012109 return self;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012110 }
12111
Victor Stinner7931d9a2011-11-04 00:22:48 +010012112 return pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012113}
12114
Alexander Belopolsky40018472011-02-26 01:02:56 +000012115PyObject *
12116PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012117{
12118 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012119
Guido van Rossumd57fd912000-03-10 22:53:23 +000012120 s = PyUnicode_FromObject(s);
12121 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012122 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012123 if (sep != NULL) {
12124 sep = PyUnicode_FromObject(sep);
12125 if (sep == NULL) {
12126 Py_DECREF(s);
12127 return NULL;
12128 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012129 }
12130
Victor Stinner9310abb2011-10-05 00:59:23 +020012131 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012132
12133 Py_DECREF(s);
12134 Py_XDECREF(sep);
12135 return result;
12136}
12137
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012138PyDoc_STRVAR(split__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012139 "S.split([sep[, maxsplit]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012140\n\
12141Return a list of the words in S, using sep as the\n\
12142delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012143splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012144whitespace string is a separator and empty strings are\n\
12145removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012146
12147static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012148unicode_split(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012149{
12150 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012151 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012152
Martin v. Löwis18e16552006-02-15 17:27:45 +000012153 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012154 return NULL;
12155
12156 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012157 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012158 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012159 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012160 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012161 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012162}
12163
Thomas Wouters477c8d52006-05-27 19:21:47 +000012164PyObject *
12165PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12166{
12167 PyObject* str_obj;
12168 PyObject* sep_obj;
12169 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012170 int kind1, kind2, kind;
12171 void *buf1 = NULL, *buf2 = NULL;
12172 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012173
12174 str_obj = PyUnicode_FromObject(str_in);
Victor Stinnere9a29352011-10-01 02:14:59 +020012175 if (!str_obj || PyUnicode_READY(str_obj) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012176 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012177 sep_obj = PyUnicode_FromObject(sep_in);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012178 if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000012179 Py_DECREF(str_obj);
12180 return NULL;
12181 }
12182
Victor Stinner14f8f022011-10-05 20:58:25 +020012183 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012184 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012185 kind = Py_MAX(kind1, kind2);
12186 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012187 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012188 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012189 if (!buf1)
12190 goto onError;
12191 buf2 = PyUnicode_DATA(sep_obj);
12192 if (kind2 != kind)
12193 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12194 if (!buf2)
12195 goto onError;
12196 len1 = PyUnicode_GET_LENGTH(str_obj);
12197 len2 = PyUnicode_GET_LENGTH(sep_obj);
12198
Victor Stinner14f8f022011-10-05 20:58:25 +020012199 switch(PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012200 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012201 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12202 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12203 else
12204 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012205 break;
12206 case PyUnicode_2BYTE_KIND:
12207 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12208 break;
12209 case PyUnicode_4BYTE_KIND:
12210 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12211 break;
12212 default:
12213 assert(0);
12214 out = 0;
12215 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012216
12217 Py_DECREF(sep_obj);
12218 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012219 if (kind1 != kind)
12220 PyMem_Free(buf1);
12221 if (kind2 != kind)
12222 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012223
12224 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012225 onError:
12226 Py_DECREF(sep_obj);
12227 Py_DECREF(str_obj);
12228 if (kind1 != kind && buf1)
12229 PyMem_Free(buf1);
12230 if (kind2 != kind && buf2)
12231 PyMem_Free(buf2);
12232 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012233}
12234
12235
12236PyObject *
12237PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12238{
12239 PyObject* str_obj;
12240 PyObject* sep_obj;
12241 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012242 int kind1, kind2, kind;
12243 void *buf1 = NULL, *buf2 = NULL;
12244 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012245
12246 str_obj = PyUnicode_FromObject(str_in);
12247 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012248 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012249 sep_obj = PyUnicode_FromObject(sep_in);
12250 if (!sep_obj) {
12251 Py_DECREF(str_obj);
12252 return NULL;
12253 }
12254
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012255 kind1 = PyUnicode_KIND(str_in);
12256 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012257 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012258 buf1 = PyUnicode_DATA(str_in);
12259 if (kind1 != kind)
12260 buf1 = _PyUnicode_AsKind(str_in, kind);
12261 if (!buf1)
12262 goto onError;
12263 buf2 = PyUnicode_DATA(sep_obj);
12264 if (kind2 != kind)
12265 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12266 if (!buf2)
12267 goto onError;
12268 len1 = PyUnicode_GET_LENGTH(str_obj);
12269 len2 = PyUnicode_GET_LENGTH(sep_obj);
12270
12271 switch(PyUnicode_KIND(str_in)) {
12272 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012273 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12274 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12275 else
12276 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012277 break;
12278 case PyUnicode_2BYTE_KIND:
12279 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12280 break;
12281 case PyUnicode_4BYTE_KIND:
12282 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12283 break;
12284 default:
12285 assert(0);
12286 out = 0;
12287 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012288
12289 Py_DECREF(sep_obj);
12290 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012291 if (kind1 != kind)
12292 PyMem_Free(buf1);
12293 if (kind2 != kind)
12294 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012295
12296 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012297 onError:
12298 Py_DECREF(sep_obj);
12299 Py_DECREF(str_obj);
12300 if (kind1 != kind && buf1)
12301 PyMem_Free(buf1);
12302 if (kind2 != kind && buf2)
12303 PyMem_Free(buf2);
12304 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012305}
12306
12307PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012308 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012309\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012310Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012311the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012312found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012313
12314static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012315unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012316{
Victor Stinner9310abb2011-10-05 00:59:23 +020012317 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012318}
12319
12320PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012321 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012322\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012323Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012324the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012325separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012326
12327static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012328unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012329{
Victor Stinner9310abb2011-10-05 00:59:23 +020012330 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012331}
12332
Alexander Belopolsky40018472011-02-26 01:02:56 +000012333PyObject *
12334PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012335{
12336 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012337
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012338 s = PyUnicode_FromObject(s);
12339 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012340 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012341 if (sep != NULL) {
12342 sep = PyUnicode_FromObject(sep);
12343 if (sep == NULL) {
12344 Py_DECREF(s);
12345 return NULL;
12346 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012347 }
12348
Victor Stinner9310abb2011-10-05 00:59:23 +020012349 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012350
12351 Py_DECREF(s);
12352 Py_XDECREF(sep);
12353 return result;
12354}
12355
12356PyDoc_STRVAR(rsplit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012357 "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012358\n\
12359Return a list of the words in S, using sep as the\n\
12360delimiter string, starting at the end of the string and\n\
12361working to the front. If maxsplit is given, at most maxsplit\n\
12362splits are done. If sep is not specified, any whitespace string\n\
12363is a separator.");
12364
12365static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012366unicode_rsplit(PyObject *self, PyObject *args)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012367{
12368 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012369 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012370
Martin v. Löwis18e16552006-02-15 17:27:45 +000012371 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012372 return NULL;
12373
12374 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012375 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012376 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012377 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012378 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012379 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012380}
12381
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012382PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012383 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012384\n\
12385Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012386Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012387is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012388
12389static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012390unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012391{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012392 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012393 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012394
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012395 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12396 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012397 return NULL;
12398
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012399 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012400}
12401
12402static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012403PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012404{
Walter Dörwald346737f2007-05-31 10:44:43 +000012405 if (PyUnicode_CheckExact(self)) {
12406 Py_INCREF(self);
12407 return self;
12408 } else
12409 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinner034f6cf2011-09-30 02:26:44 +020012410 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012411}
12412
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012413PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012414 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012415\n\
12416Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012417and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012418
12419static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012420unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012421{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012422 return fixup(self, fixswapcase);
12423}
12424
Georg Brandlceee0772007-11-27 23:48:05 +000012425PyDoc_STRVAR(maketrans__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012426 "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012427\n\
12428Return a translation table usable for str.translate().\n\
12429If there is only one argument, it must be a dictionary mapping Unicode\n\
12430ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012431Character keys will be then converted to ordinals.\n\
Georg Brandlceee0772007-11-27 23:48:05 +000012432If there are two arguments, they must be strings of equal length, and\n\
12433in the resulting dictionary, each character in x will be mapped to the\n\
12434character at the same position in y. If there is a third argument, it\n\
12435must be a string, whose characters will be mapped to None in the result.");
12436
12437static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012438unicode_maketrans(PyObject *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012439{
12440 PyObject *x, *y = NULL, *z = NULL;
12441 PyObject *new = NULL, *key, *value;
12442 Py_ssize_t i = 0;
12443 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012444
Georg Brandlceee0772007-11-27 23:48:05 +000012445 if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12446 return NULL;
12447 new = PyDict_New();
12448 if (!new)
12449 return NULL;
12450 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012451 int x_kind, y_kind, z_kind;
12452 void *x_data, *y_data, *z_data;
12453
Georg Brandlceee0772007-11-27 23:48:05 +000012454 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012455 if (!PyUnicode_Check(x)) {
12456 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12457 "be a string if there is a second argument");
12458 goto err;
12459 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012460 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012461 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12462 "arguments must have equal length");
12463 goto err;
12464 }
12465 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012466 x_kind = PyUnicode_KIND(x);
12467 y_kind = PyUnicode_KIND(y);
12468 x_data = PyUnicode_DATA(x);
12469 y_data = PyUnicode_DATA(y);
12470 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12471 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12472 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012473 if (!key || !value)
12474 goto err;
12475 res = PyDict_SetItem(new, key, value);
12476 Py_DECREF(key);
12477 Py_DECREF(value);
12478 if (res < 0)
12479 goto err;
12480 }
12481 /* create entries for deleting chars in z */
12482 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012483 z_kind = PyUnicode_KIND(z);
12484 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012485 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012486 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012487 if (!key)
12488 goto err;
12489 res = PyDict_SetItem(new, key, Py_None);
12490 Py_DECREF(key);
12491 if (res < 0)
12492 goto err;
12493 }
12494 }
12495 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012496 int kind;
12497 void *data;
12498
Georg Brandlceee0772007-11-27 23:48:05 +000012499 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012500 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012501 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12502 "to maketrans it must be a dict");
12503 goto err;
12504 }
12505 /* copy entries into the new dict, converting string keys to int keys */
12506 while (PyDict_Next(x, &i, &key, &value)) {
12507 if (PyUnicode_Check(key)) {
12508 /* convert string keys to integer keys */
12509 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012510 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000012511 PyErr_SetString(PyExc_ValueError, "string keys in translate "
12512 "table must be of length 1");
12513 goto err;
12514 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012515 kind = PyUnicode_KIND(key);
12516 data = PyUnicode_DATA(key);
12517 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000012518 if (!newkey)
12519 goto err;
12520 res = PyDict_SetItem(new, newkey, value);
12521 Py_DECREF(newkey);
12522 if (res < 0)
12523 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000012524 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012525 /* just keep integer keys */
12526 if (PyDict_SetItem(new, key, value) < 0)
12527 goto err;
12528 } else {
12529 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12530 "be strings or integers");
12531 goto err;
12532 }
12533 }
12534 }
12535 return new;
12536 err:
12537 Py_DECREF(new);
12538 return NULL;
12539}
12540
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012541PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012542 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012543\n\
12544Return a copy of the string S, where all characters have been mapped\n\
12545through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012546Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000012547Unmapped characters are left untouched. Characters mapped to None\n\
12548are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012549
12550static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012551unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012552{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012553 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012554}
12555
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012556PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012557 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012558\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012559Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012560
12561static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012562unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012563{
Guido van Rossumd57fd912000-03-10 22:53:23 +000012564 return fixup(self, fixupper);
12565}
12566
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012567PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012568 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012569\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000012570Pad a numeric string S with zeros on the left, to fill a field\n\
12571of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012572
12573static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012574unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012575{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012576 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020012577 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012578 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012579 int kind;
12580 void *data;
12581 Py_UCS4 chr;
12582
12583 if (PyUnicode_READY(self) == -1)
12584 return NULL;
12585
Martin v. Löwis18e16552006-02-15 17:27:45 +000012586 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012587 return NULL;
12588
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012589 if (PyUnicode_GET_LENGTH(self) >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012590 if (PyUnicode_CheckExact(self)) {
12591 Py_INCREF(self);
Victor Stinner7931d9a2011-11-04 00:22:48 +010012592 return self;
Walter Dörwald0fe940c2002-04-15 18:42:15 +000012593 }
12594 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012595 return PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012596 }
12597
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012598 fill = width - _PyUnicode_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012599
12600 u = pad(self, fill, 0, '0');
12601
Walter Dörwald068325e2002-04-15 13:36:47 +000012602 if (u == NULL)
12603 return NULL;
12604
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012605 kind = PyUnicode_KIND(u);
12606 data = PyUnicode_DATA(u);
12607 chr = PyUnicode_READ(kind, data, fill);
12608
12609 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000012610 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012611 PyUnicode_WRITE(kind, data, 0, chr);
12612 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000012613 }
12614
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012615 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010012616 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012617}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012618
#if 0
/* Disabled debugging helper: forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted;

    converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
12626
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012627PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012628 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012629\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012630Return True if S starts with the specified prefix, False otherwise.\n\
12631With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012632With optional end, stop comparing S at that position.\n\
12633prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012634
12635static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012636unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012637 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012638{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012639 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012640 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012641 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012642 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012643 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012644
Jesus Ceaac451502011-04-20 17:09:23 +020012645 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012646 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012647 if (PyTuple_Check(subobj)) {
12648 Py_ssize_t i;
12649 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012650 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012651 if (substring == NULL)
12652 return NULL;
12653 result = tailmatch(self, substring, start, end, -1);
12654 Py_DECREF(substring);
12655 if (result) {
12656 Py_RETURN_TRUE;
12657 }
12658 }
12659 /* nothing matched */
12660 Py_RETURN_FALSE;
12661 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012662 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012663 if (substring == NULL) {
12664 if (PyErr_ExceptionMatches(PyExc_TypeError))
12665 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12666 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012667 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012668 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012669 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012670 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012671 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012672}
12673
12674
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012675PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012676 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012677\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000012678Return True if S ends with the specified suffix, False otherwise.\n\
12679With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012680With optional end, stop comparing S at that position.\n\
12681suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012682
12683static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012684unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000012685 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012686{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012687 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012688 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012689 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012690 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012691 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012692
Jesus Ceaac451502011-04-20 17:09:23 +020012693 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000012694 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012695 if (PyTuple_Check(subobj)) {
12696 Py_ssize_t i;
12697 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012698 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000012699 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012700 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012701 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012702 result = tailmatch(self, substring, start, end, +1);
12703 Py_DECREF(substring);
12704 if (result) {
12705 Py_RETURN_TRUE;
12706 }
12707 }
12708 Py_RETURN_FALSE;
12709 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012710 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030012711 if (substring == NULL) {
12712 if (PyErr_ExceptionMatches(PyExc_TypeError))
12713 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12714 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000012715 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030012716 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012717 result = tailmatch(self, substring, start, end, +1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012718 Py_DECREF(substring);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000012719 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012720}
12721
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012722#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000012723
12724PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012725 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012726\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012727Return a formatted version of S, using substitutions from args and kwargs.\n\
12728The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000012729
Eric Smith27bbca62010-11-04 17:06:58 +000012730PyDoc_STRVAR(format_map__doc__,
12731 "S.format_map(mapping) -> str\n\
12732\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012733Return a formatted version of S, using substitutions from mapping.\n\
12734The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000012735
Eric Smith4a7d76d2008-05-30 18:10:19 +000012736static PyObject *
12737unicode__format__(PyObject* self, PyObject* args)
12738{
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012739 PyObject *format_spec, *out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012740
12741 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12742 return NULL;
12743
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012744 out = _PyUnicode_FormatAdvanced(self, format_spec, 0,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012745 PyUnicode_GET_LENGTH(format_spec));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012746 return out;
Eric Smith4a7d76d2008-05-30 18:10:19 +000012747}
12748
Eric Smith8c663262007-08-25 02:26:07 +000012749PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012750 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000012751\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000012752Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000012753
12754static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012755unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012756{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012757 Py_ssize_t size;
12758
12759 /* If it's a compact object, account for base structure +
12760 character data. */
12761 if (PyUnicode_IS_COMPACT_ASCII(v))
12762 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12763 else if (PyUnicode_IS_COMPACT(v))
12764 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012765 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012766 else {
12767 /* If it is a two-block object, account for base object, and
12768 for character block if present. */
12769 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020012770 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012771 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012772 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012773 }
12774 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020012775 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020012776 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012777 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020012778 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020012779 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012780
12781 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012782}
12783
12784PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012785 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012786
12787static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020012788unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012789{
Victor Stinner034f6cf2011-09-30 02:26:44 +020012790 PyObject *copy = PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012791 if (!copy)
12792 return NULL;
12793 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000012794}
12795
Guido van Rossumd57fd912000-03-10 22:53:23 +000012796static PyMethodDef unicode_methods[] = {
12797
12798 /* Order is according to common usage: often used methods should
12799 appear first, since lookup is done sequentially. */
12800
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000012801 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012802 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12803 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012804 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012805 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12806 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12807 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12808 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12809 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12810 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12811 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012812 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012813 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12814 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12815 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012816 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012817 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12818 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12819 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012820 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000012821 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012822 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012823 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012824 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12825 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12826 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12827 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12828 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12829 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12830 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12831 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12832 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12833 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12834 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12835 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12836 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12837 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000012838 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000012839 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012840 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000012841 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000012842 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000012843 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Georg Brandlceee0772007-11-27 23:48:05 +000012844 {"maketrans", (PyCFunction) unicode_maketrans,
12845 METH_VARARGS | METH_STATIC, maketrans__doc__},
Georg Brandlc28e1fa2008-06-10 19:20:26 +000012846 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000012847#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000012848 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012849#endif
12850
12851#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012852 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000012853 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012854#endif
12855
Benjamin Peterson14339b62009-01-31 16:36:08 +000012856 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000012857 {NULL, NULL}
12858};
12859
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012860static PyObject *
12861unicode_mod(PyObject *v, PyObject *w)
12862{
Brian Curtindfc80e32011-08-10 20:28:54 -050012863 if (!PyUnicode_Check(v))
12864 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000012865 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012866}
12867
12868static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012869 0, /*nb_add*/
12870 0, /*nb_subtract*/
12871 0, /*nb_multiply*/
12872 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000012873};
12874
Guido van Rossumd57fd912000-03-10 22:53:23 +000012875static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012876 (lenfunc) unicode_length, /* sq_length */
12877 PyUnicode_Concat, /* sq_concat */
12878 (ssizeargfunc) unicode_repeat, /* sq_repeat */
12879 (ssizeargfunc) unicode_getitem, /* sq_item */
12880 0, /* sq_slice */
12881 0, /* sq_ass_item */
12882 0, /* sq_ass_slice */
12883 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012884};
12885
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012886static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012887unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012888{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012889 if (PyUnicode_READY(self) == -1)
12890 return NULL;
12891
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000012892 if (PyIndex_Check(item)) {
12893 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012894 if (i == -1 && PyErr_Occurred())
12895 return NULL;
12896 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012897 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012898 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012899 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000012900 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012901 PyObject *result;
12902 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012903 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012904 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012905
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012906 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000012907 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012908 return NULL;
12909 }
12910
12911 if (slicelength <= 0) {
Victor Stinner382955f2011-12-11 21:44:00 +010012912 Py_INCREF(unicode_empty);
12913 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012914 } else if (start == 0 && step == 1 &&
12915 slicelength == PyUnicode_GET_LENGTH(self) &&
Thomas Woutersed03b412007-08-28 21:37:11 +000012916 PyUnicode_CheckExact(self)) {
12917 Py_INCREF(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012918 return self;
Thomas Woutersed03b412007-08-28 21:37:11 +000012919 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012920 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020012921 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012922 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012923 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012924 src_kind = PyUnicode_KIND(self);
12925 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020012926 if (!PyUnicode_IS_ASCII(self)) {
12927 kind_limit = kind_maxchar_limit(src_kind);
12928 max_char = 0;
12929 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12930 ch = PyUnicode_READ(src_kind, src_data, cur);
12931 if (ch > max_char) {
12932 max_char = ch;
12933 if (max_char >= kind_limit)
12934 break;
12935 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020012936 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012937 }
Victor Stinner55c99112011-10-13 01:17:06 +020012938 else
12939 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012940 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012941 if (result == NULL)
12942 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012943 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012944 dest_data = PyUnicode_DATA(result);
12945
12946 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020012947 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
12948 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012949 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012950 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020012951 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012952 } else {
12953 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12954 return NULL;
12955 }
12956}
12957
12958static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012959 (lenfunc)unicode_length, /* mp_length */
12960 (binaryfunc)unicode_subscript, /* mp_subscript */
12961 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000012962};
12963
Guido van Rossumd57fd912000-03-10 22:53:23 +000012964
Guido van Rossumd57fd912000-03-10 22:53:23 +000012965/* Helpers for PyUnicode_Format() */
12966
12967static PyObject *
Martin v. Löwis18e16552006-02-15 17:27:45 +000012968getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012969{
Martin v. Löwis18e16552006-02-15 17:27:45 +000012970 Py_ssize_t argidx = *p_argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012971 if (argidx < arglen) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012972 (*p_argidx)++;
12973 if (arglen < 0)
12974 return args;
12975 else
12976 return PyTuple_GetItem(args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012977 }
12978 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012979 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012980 return NULL;
12981}
12982
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012983/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000012984
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012985static PyObject *
12986formatfloat(PyObject *v, int flags, int prec, int type)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012987{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012988 char *p;
12989 PyObject *result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012990 double x;
Tim Petersced69f82003-09-16 20:30:58 +000012991
Guido van Rossumd57fd912000-03-10 22:53:23 +000012992 x = PyFloat_AsDouble(v);
12993 if (x == -1.0 && PyErr_Occurred())
Mark Dickinsonf489caf2009-05-01 11:42:00 +000012994 return NULL;
12995
Guido van Rossumd57fd912000-03-10 22:53:23 +000012996 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012997 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000012998
Eric Smith0923d1d2009-04-16 20:16:10 +000012999 p = PyOS_double_to_string(x, type, prec,
13000 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013001 if (p == NULL)
13002 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013003 result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
Eric Smith0923d1d2009-04-16 20:16:10 +000013004 PyMem_Free(p);
13005 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013006}
13007
Tim Peters38fd5b62000-09-21 05:43:11 +000013008static PyObject*
13009formatlong(PyObject *val, int flags, int prec, int type)
13010{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013011 char *buf;
13012 int len;
13013 PyObject *str; /* temporary string object. */
13014 PyObject *result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013015
Benjamin Peterson14339b62009-01-31 16:36:08 +000013016 str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
13017 if (!str)
13018 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013019 result = PyUnicode_DecodeASCII(buf, len, NULL);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013020 Py_DECREF(str);
13021 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013022}
13023
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013024static Py_UCS4
13025formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013026{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013027 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013028 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013029 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013030 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000013031 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013032 goto onError;
13033 }
13034 else {
13035 /* Integer input truncated to a character */
13036 long x;
13037 x = PyLong_AsLong(v);
13038 if (x == -1 && PyErr_Occurred())
13039 goto onError;
13040
Victor Stinner8faf8212011-12-08 22:14:11 +010013041 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013042 PyErr_SetString(PyExc_OverflowError,
13043 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013044 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013045 }
13046
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013047 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013048 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000013049
Benjamin Peterson29060642009-01-31 22:14:21 +000013050 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000013051 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013052 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013053 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013054}
13055
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013056static int
13057repeat_accumulate(_PyAccu *acc, PyObject *obj, Py_ssize_t count)
13058{
13059 int r;
13060 assert(count > 0);
13061 assert(PyUnicode_Check(obj));
13062 if (count > 5) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013063 PyObject *repeated = unicode_repeat(obj, count);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013064 if (repeated == NULL)
13065 return -1;
13066 r = _PyAccu_Accumulate(acc, repeated);
13067 Py_DECREF(repeated);
13068 return r;
13069 }
13070 else {
13071 do {
13072 if (_PyAccu_Accumulate(acc, obj))
13073 return -1;
13074 } while (--count);
13075 return 0;
13076 }
13077}
13078
Alexander Belopolsky40018472011-02-26 01:02:56 +000013079PyObject *
13080PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013081{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013082 void *fmt;
13083 int fmtkind;
13084 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013085 int kind;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013086 int r;
13087 Py_ssize_t fmtcnt, fmtpos, arglen, argidx;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013088 int args_owned = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013089 PyObject *dict = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013090 PyObject *temp = NULL;
13091 PyObject *second = NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013092 PyObject *uformat;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013093 _PyAccu acc;
13094 static PyObject *plus, *minus, *blank, *zero, *percent;
13095
13096 if (!plus && !(plus = get_latin1_char('+')))
13097 return NULL;
13098 if (!minus && !(minus = get_latin1_char('-')))
13099 return NULL;
13100 if (!blank && !(blank = get_latin1_char(' ')))
13101 return NULL;
13102 if (!zero && !(zero = get_latin1_char('0')))
13103 return NULL;
13104 if (!percent && !(percent = get_latin1_char('%')))
13105 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000013106
Guido van Rossumd57fd912000-03-10 22:53:23 +000013107 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013108 PyErr_BadInternalCall();
13109 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013110 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013111 uformat = PyUnicode_FromObject(format);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013112 if (uformat == NULL || PyUnicode_READY(uformat) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013113 return NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013114 if (_PyAccu_Init(&acc))
13115 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013116 fmt = PyUnicode_DATA(uformat);
13117 fmtkind = PyUnicode_KIND(uformat);
13118 fmtcnt = PyUnicode_GET_LENGTH(uformat);
13119 fmtpos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013120
Guido van Rossumd57fd912000-03-10 22:53:23 +000013121 if (PyTuple_Check(args)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013122 arglen = PyTuple_Size(args);
13123 argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013124 }
13125 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013126 arglen = -1;
13127 argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013128 }
Christian Heimes90aa7642007-12-19 02:45:37 +000013129 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
Christian Heimesf3863112007-11-22 07:46:41 +000013130 !PyUnicode_Check(args))
Benjamin Peterson29060642009-01-31 22:14:21 +000013131 dict = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013132
13133 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013134 if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013135 PyObject *nonfmt;
13136 Py_ssize_t nonfmtpos;
13137 nonfmtpos = fmtpos++;
13138 while (fmtcnt >= 0 &&
13139 PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
13140 fmtpos++;
13141 fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013142 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013143 nonfmt = PyUnicode_Substring(uformat, nonfmtpos, fmtpos);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013144 if (nonfmt == NULL)
13145 goto onError;
13146 r = _PyAccu_Accumulate(&acc, nonfmt);
13147 Py_DECREF(nonfmt);
13148 if (r)
13149 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013150 }
13151 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013152 /* Got a format specifier */
13153 int flags = 0;
13154 Py_ssize_t width = -1;
13155 int prec = -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013156 Py_UCS4 c = '\0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013157 Py_UCS4 fill, sign;
Benjamin Peterson29060642009-01-31 22:14:21 +000013158 int isnumok;
13159 PyObject *v = NULL;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013160 void *pbuf = NULL;
13161 Py_ssize_t pindex, len;
13162 PyObject *signobj = NULL, *fillobj = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013163
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013164 fmtpos++;
13165 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
13166 Py_ssize_t keystart;
Benjamin Peterson29060642009-01-31 22:14:21 +000013167 Py_ssize_t keylen;
13168 PyObject *key;
13169 int pcount = 1;
Christian Heimesa612dc02008-02-24 13:08:18 +000013170
Benjamin Peterson29060642009-01-31 22:14:21 +000013171 if (dict == NULL) {
13172 PyErr_SetString(PyExc_TypeError,
13173 "format requires a mapping");
13174 goto onError;
13175 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013176 ++fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013177 --fmtcnt;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013178 keystart = fmtpos;
Benjamin Peterson29060642009-01-31 22:14:21 +000013179 /* Skip over balanced parentheses */
13180 while (pcount > 0 && --fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013181 if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
Benjamin Peterson29060642009-01-31 22:14:21 +000013182 --pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013183 else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
Benjamin Peterson29060642009-01-31 22:14:21 +000013184 ++pcount;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013185 fmtpos++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013186 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013187 keylen = fmtpos - keystart - 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000013188 if (fmtcnt < 0 || pcount > 0) {
13189 PyErr_SetString(PyExc_ValueError,
13190 "incomplete format key");
13191 goto onError;
13192 }
Victor Stinner7931d9a2011-11-04 00:22:48 +010013193 key = PyUnicode_Substring(uformat,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013194 keystart, keystart + keylen);
Benjamin Peterson29060642009-01-31 22:14:21 +000013195 if (key == NULL)
13196 goto onError;
13197 if (args_owned) {
13198 Py_DECREF(args);
13199 args_owned = 0;
13200 }
13201 args = PyObject_GetItem(dict, key);
13202 Py_DECREF(key);
13203 if (args == NULL) {
13204 goto onError;
13205 }
13206 args_owned = 1;
13207 arglen = -1;
13208 argidx = -2;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013209 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013210 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013211 switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013212 case '-': flags |= F_LJUST; continue;
13213 case '+': flags |= F_SIGN; continue;
13214 case ' ': flags |= F_BLANK; continue;
13215 case '#': flags |= F_ALT; continue;
13216 case '0': flags |= F_ZERO; continue;
13217 }
13218 break;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013219 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013220 if (c == '*') {
13221 v = getnextarg(args, arglen, &argidx);
13222 if (v == NULL)
13223 goto onError;
13224 if (!PyLong_Check(v)) {
13225 PyErr_SetString(PyExc_TypeError,
13226 "* wants int");
13227 goto onError;
13228 }
13229 width = PyLong_AsLong(v);
13230 if (width == -1 && PyErr_Occurred())
13231 goto onError;
13232 if (width < 0) {
13233 flags |= F_LJUST;
13234 width = -width;
13235 }
13236 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013237 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013238 }
13239 else if (c >= '0' && c <= '9') {
13240 width = c - '0';
13241 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013242 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013243 if (c < '0' || c > '9')
13244 break;
13245 if ((width*10) / 10 != width) {
13246 PyErr_SetString(PyExc_ValueError,
13247 "width too big");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013248 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013249 }
13250 width = width*10 + (c - '0');
13251 }
13252 }
13253 if (c == '.') {
13254 prec = 0;
13255 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013256 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013257 if (c == '*') {
13258 v = getnextarg(args, arglen, &argidx);
13259 if (v == NULL)
13260 goto onError;
13261 if (!PyLong_Check(v)) {
13262 PyErr_SetString(PyExc_TypeError,
13263 "* wants int");
13264 goto onError;
13265 }
13266 prec = PyLong_AsLong(v);
13267 if (prec == -1 && PyErr_Occurred())
13268 goto onError;
13269 if (prec < 0)
13270 prec = 0;
13271 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013272 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013273 }
13274 else if (c >= '0' && c <= '9') {
13275 prec = c - '0';
13276 while (--fmtcnt >= 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013277 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013278 if (c < '0' || c > '9')
13279 break;
13280 if ((prec*10) / 10 != prec) {
13281 PyErr_SetString(PyExc_ValueError,
13282 "prec too big");
13283 goto onError;
13284 }
13285 prec = prec*10 + (c - '0');
13286 }
13287 }
13288 } /* prec */
13289 if (fmtcnt >= 0) {
13290 if (c == 'h' || c == 'l' || c == 'L') {
13291 if (--fmtcnt >= 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013292 c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
Benjamin Peterson29060642009-01-31 22:14:21 +000013293 }
13294 }
13295 if (fmtcnt < 0) {
13296 PyErr_SetString(PyExc_ValueError,
13297 "incomplete format");
13298 goto onError;
13299 }
13300 if (c != '%') {
13301 v = getnextarg(args, arglen, &argidx);
13302 if (v == NULL)
13303 goto onError;
13304 }
13305 sign = 0;
13306 fill = ' ';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013307 fillobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013308 switch (c) {
13309
13310 case '%':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013311 _PyAccu_Accumulate(&acc, percent);
13312 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +000013313
13314 case 's':
13315 case 'r':
13316 case 'a':
Victor Stinner808fc0a2010-03-22 12:50:40 +000013317 if (PyUnicode_CheckExact(v) && c == 's') {
Benjamin Peterson29060642009-01-31 22:14:21 +000013318 temp = v;
13319 Py_INCREF(temp);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013320 }
13321 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000013322 if (c == 's')
13323 temp = PyObject_Str(v);
13324 else if (c == 'r')
13325 temp = PyObject_Repr(v);
13326 else
13327 temp = PyObject_ASCII(v);
13328 if (temp == NULL)
13329 goto onError;
13330 if (PyUnicode_Check(temp))
13331 /* nothing to do */;
13332 else {
13333 Py_DECREF(temp);
13334 PyErr_SetString(PyExc_TypeError,
13335 "%s argument has non-string str()");
13336 goto onError;
13337 }
13338 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013339 if (PyUnicode_READY(temp) == -1) {
13340 Py_CLEAR(temp);
13341 goto onError;
13342 }
13343 pbuf = PyUnicode_DATA(temp);
13344 kind = PyUnicode_KIND(temp);
13345 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013346 if (prec >= 0 && len > prec)
13347 len = prec;
13348 break;
13349
13350 case 'i':
13351 case 'd':
13352 case 'u':
13353 case 'o':
13354 case 'x':
13355 case 'X':
Benjamin Peterson29060642009-01-31 22:14:21 +000013356 isnumok = 0;
13357 if (PyNumber_Check(v)) {
13358 PyObject *iobj=NULL;
13359
13360 if (PyLong_Check(v)) {
13361 iobj = v;
13362 Py_INCREF(iobj);
13363 }
13364 else {
13365 iobj = PyNumber_Long(v);
13366 }
13367 if (iobj!=NULL) {
13368 if (PyLong_Check(iobj)) {
13369 isnumok = 1;
Senthil Kumaran9ebe08d2011-07-03 21:03:16 -070013370 temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
Benjamin Peterson29060642009-01-31 22:14:21 +000013371 Py_DECREF(iobj);
13372 if (!temp)
13373 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013374 if (PyUnicode_READY(temp) == -1) {
13375 Py_CLEAR(temp);
13376 goto onError;
13377 }
13378 pbuf = PyUnicode_DATA(temp);
13379 kind = PyUnicode_KIND(temp);
13380 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013381 sign = 1;
13382 }
13383 else {
13384 Py_DECREF(iobj);
13385 }
13386 }
13387 }
13388 if (!isnumok) {
13389 PyErr_Format(PyExc_TypeError,
13390 "%%%c format: a number is required, "
13391 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
13392 goto onError;
13393 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013394 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013395 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013396 fillobj = zero;
13397 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013398 break;
13399
13400 case 'e':
13401 case 'E':
13402 case 'f':
13403 case 'F':
13404 case 'g':
13405 case 'G':
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013406 temp = formatfloat(v, flags, prec, c);
13407 if (!temp)
Benjamin Peterson29060642009-01-31 22:14:21 +000013408 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013409 if (PyUnicode_READY(temp) == -1) {
13410 Py_CLEAR(temp);
13411 goto onError;
13412 }
13413 pbuf = PyUnicode_DATA(temp);
13414 kind = PyUnicode_KIND(temp);
13415 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013416 sign = 1;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013417 if (flags & F_ZERO) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013418 fill = '0';
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013419 fillobj = zero;
13420 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013421 break;
13422
13423 case 'c':
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013424 {
13425 Py_UCS4 ch = formatchar(v);
13426 if (ch == (Py_UCS4) -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013427 goto onError;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013428 temp = _PyUnicode_FromUCS4(&ch, 1);
13429 if (temp == NULL)
13430 goto onError;
13431 pbuf = PyUnicode_DATA(temp);
13432 kind = PyUnicode_KIND(temp);
13433 len = PyUnicode_GET_LENGTH(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013434 break;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013435 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013436
13437 default:
13438 PyErr_Format(PyExc_ValueError,
13439 "unsupported format character '%c' (0x%x) "
13440 "at index %zd",
13441 (31<=c && c<=126) ? (char)c : '?',
13442 (int)c,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013443 fmtpos - 1);
Benjamin Peterson29060642009-01-31 22:14:21 +000013444 goto onError;
13445 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013446 /* pbuf is initialized here. */
13447 pindex = 0;
Benjamin Peterson29060642009-01-31 22:14:21 +000013448 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013449 if (PyUnicode_READ(kind, pbuf, pindex) == '-') {
13450 signobj = minus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013451 len--;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013452 pindex++;
13453 }
13454 else if (PyUnicode_READ(kind, pbuf, pindex) == '+') {
13455 signobj = plus;
13456 len--;
13457 pindex++;
Benjamin Peterson29060642009-01-31 22:14:21 +000013458 }
13459 else if (flags & F_SIGN)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013460 signobj = plus;
Benjamin Peterson29060642009-01-31 22:14:21 +000013461 else if (flags & F_BLANK)
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013462 signobj = blank;
Benjamin Peterson29060642009-01-31 22:14:21 +000013463 else
13464 sign = 0;
13465 }
13466 if (width < len)
13467 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013468 if (sign) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013469 if (fill != ' ') {
13470 assert(signobj != NULL);
13471 if (_PyAccu_Accumulate(&acc, signobj))
13472 goto onError;
13473 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013474 if (width > len)
13475 width--;
13476 }
13477 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013478 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013479 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == c);
Benjamin Peterson29060642009-01-31 22:14:21 +000013480 if (fill != ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013481 second = get_latin1_char(
13482 PyUnicode_READ(kind, pbuf, pindex + 1));
13483 pindex += 2;
13484 if (second == NULL ||
13485 _PyAccu_Accumulate(&acc, zero) ||
13486 _PyAccu_Accumulate(&acc, second))
13487 goto onError;
13488 Py_CLEAR(second);
Benjamin Peterson29060642009-01-31 22:14:21 +000013489 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013490 width -= 2;
13491 if (width < 0)
13492 width = 0;
13493 len -= 2;
13494 }
13495 if (width > len && !(flags & F_LJUST)) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013496 assert(fillobj != NULL);
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013497 if (repeat_accumulate(&acc, fillobj, width - len))
13498 goto onError;
13499 width = len;
Benjamin Peterson29060642009-01-31 22:14:21 +000013500 }
13501 if (fill == ' ') {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013502 if (sign) {
13503 assert(signobj != NULL);
13504 if (_PyAccu_Accumulate(&acc, signobj))
13505 goto onError;
13506 }
Benjamin Peterson29060642009-01-31 22:14:21 +000013507 if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013508 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
13509 assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013510 second = get_latin1_char(
13511 PyUnicode_READ(kind, pbuf, pindex + 1));
13512 pindex += 2;
13513 if (second == NULL ||
13514 _PyAccu_Accumulate(&acc, zero) ||
13515 _PyAccu_Accumulate(&acc, second))
13516 goto onError;
13517 Py_CLEAR(second);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013518 }
13519 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013520 /* Copy all characters, preserving len */
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013521 if (temp != NULL) {
13522 assert(pbuf == PyUnicode_DATA(temp));
13523 v = PyUnicode_Substring(temp, pindex, pindex + len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013524 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013525 else {
13526 const char *p = (const char *) pbuf;
13527 assert(pbuf != NULL);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013528 p += kind * pindex;
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013529 v = PyUnicode_FromKindAndData(kind, p, len);
13530 }
13531 if (v == NULL)
13532 goto onError;
13533 r = _PyAccu_Accumulate(&acc, v);
13534 Py_DECREF(v);
13535 if (r)
13536 goto onError;
Antoine Pitrou978b9d22011-10-07 12:35:48 +020013537 if (width > len && repeat_accumulate(&acc, blank, width - len))
13538 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +000013539 if (dict && (argidx < arglen) && c != '%') {
13540 PyErr_SetString(PyExc_TypeError,
13541 "not all arguments converted during string formatting");
Benjamin Peterson29060642009-01-31 22:14:21 +000013542 goto onError;
13543 }
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013544 Py_CLEAR(temp);
Benjamin Peterson29060642009-01-31 22:14:21 +000013545 } /* '%' */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013546 } /* until end */
13547 if (argidx < arglen && !dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013548 PyErr_SetString(PyExc_TypeError,
13549 "not all arguments converted during string formatting");
13550 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013551 }
13552
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013553 result = _PyAccu_Finish(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013554 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013555 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013556 }
13557 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013558 Py_XDECREF(temp);
13559 Py_XDECREF(second);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013560 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013561
Benjamin Peterson29060642009-01-31 22:14:21 +000013562 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013563 Py_DECREF(uformat);
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020013564 Py_XDECREF(temp);
13565 Py_XDECREF(second);
13566 _PyAccu_Destroy(&acc);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013567 if (args_owned) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013568 Py_DECREF(args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013569 }
13570 return NULL;
13571}
13572
Jeremy Hylton938ace62002-07-17 16:30:39 +000013573static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000013574unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
13575
Tim Peters6d6c1a32001-08-02 04:15:00 +000013576static PyObject *
13577unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13578{
Benjamin Peterson29060642009-01-31 22:14:21 +000013579 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013580 static char *kwlist[] = {"object", "encoding", "errors", 0};
13581 char *encoding = NULL;
13582 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000013583
Benjamin Peterson14339b62009-01-31 16:36:08 +000013584 if (type != &PyUnicode_Type)
13585 return unicode_subtype_new(type, args, kwds);
13586 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000013587 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013588 return NULL;
Victor Stinner382955f2011-12-11 21:44:00 +010013589 if (x == NULL) {
13590 Py_INCREF(unicode_empty);
13591 return unicode_empty;
13592 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013593 if (encoding == NULL && errors == NULL)
13594 return PyObject_Str(x);
13595 else
Benjamin Peterson29060642009-01-31 22:14:21 +000013596 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000013597}
13598
Guido van Rossume023fe02001-08-30 03:12:59 +000013599static PyObject *
13600unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13601{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013602 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013603 Py_ssize_t length, char_size;
13604 int share_wstr, share_utf8;
13605 unsigned int kind;
13606 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000013607
Benjamin Peterson14339b62009-01-31 16:36:08 +000013608 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013609
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013610 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013611 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013612 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013613 assert(_PyUnicode_CHECK(unicode));
Victor Stinnere06e1452011-10-04 20:52:31 +020013614 if (PyUnicode_READY(unicode))
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013615 return NULL;
13616
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013617 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013618 if (self == NULL) {
13619 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013620 return NULL;
13621 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013622 kind = PyUnicode_KIND(unicode);
13623 length = PyUnicode_GET_LENGTH(unicode);
13624
13625 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013626#ifdef Py_DEBUG
13627 _PyUnicode_HASH(self) = -1;
13628#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013629 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013630#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013631 _PyUnicode_STATE(self).interned = 0;
13632 _PyUnicode_STATE(self).kind = kind;
13633 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020013634 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013635 _PyUnicode_STATE(self).ready = 1;
13636 _PyUnicode_WSTR(self) = NULL;
13637 _PyUnicode_UTF8_LENGTH(self) = 0;
13638 _PyUnicode_UTF8(self) = NULL;
13639 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020013640 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013641
13642 share_utf8 = 0;
13643 share_wstr = 0;
13644 if (kind == PyUnicode_1BYTE_KIND) {
13645 char_size = 1;
13646 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13647 share_utf8 = 1;
13648 }
13649 else if (kind == PyUnicode_2BYTE_KIND) {
13650 char_size = 2;
13651 if (sizeof(wchar_t) == 2)
13652 share_wstr = 1;
13653 }
13654 else {
13655 assert(kind == PyUnicode_4BYTE_KIND);
13656 char_size = 4;
13657 if (sizeof(wchar_t) == 4)
13658 share_wstr = 1;
13659 }
13660
13661 /* Ensure we won't overflow the length. */
13662 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13663 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013664 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013665 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013666 data = PyObject_MALLOC((length + 1) * char_size);
13667 if (data == NULL) {
13668 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013669 goto onError;
13670 }
13671
Victor Stinnerc3c74152011-10-02 20:39:55 +020013672 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013673 if (share_utf8) {
13674 _PyUnicode_UTF8_LENGTH(self) = length;
13675 _PyUnicode_UTF8(self) = data;
13676 }
13677 if (share_wstr) {
13678 _PyUnicode_WSTR_LENGTH(self) = length;
13679 _PyUnicode_WSTR(self) = (wchar_t *)data;
13680 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013681
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013682 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013683 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013684 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020013685#ifdef Py_DEBUG
13686 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13687#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020013688 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010013689 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020013690
13691onError:
13692 Py_DECREF(unicode);
13693 Py_DECREF(self);
13694 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000013695}
13696
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013697PyDoc_STRVAR(unicode_doc,
Benjamin Peterson29060642009-01-31 22:14:21 +000013698 "str(string[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000013699\n\
Collin Winterd474ce82007-08-07 19:42:11 +000013700Create a new string object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +000013701encoding defaults to the current default string encoding.\n\
13702errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000013703
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013704static PyObject *unicode_iter(PyObject *seq);
13705
Guido van Rossumd57fd912000-03-10 22:53:23 +000013706PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000013707 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000013708 "str", /* tp_name */
13709 sizeof(PyUnicodeObject), /* tp_size */
13710 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013711 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013712 (destructor)unicode_dealloc, /* tp_dealloc */
13713 0, /* tp_print */
13714 0, /* tp_getattr */
13715 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000013716 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013717 unicode_repr, /* tp_repr */
13718 &unicode_as_number, /* tp_as_number */
13719 &unicode_as_sequence, /* tp_as_sequence */
13720 &unicode_as_mapping, /* tp_as_mapping */
13721 (hashfunc) unicode_hash, /* tp_hash*/
13722 0, /* tp_call*/
13723 (reprfunc) unicode_str, /* tp_str */
13724 PyObject_GenericGetAttr, /* tp_getattro */
13725 0, /* tp_setattro */
13726 0, /* tp_as_buffer */
13727 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000013728 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000013729 unicode_doc, /* tp_doc */
13730 0, /* tp_traverse */
13731 0, /* tp_clear */
13732 PyUnicode_RichCompare, /* tp_richcompare */
13733 0, /* tp_weaklistoffset */
13734 unicode_iter, /* tp_iter */
13735 0, /* tp_iternext */
13736 unicode_methods, /* tp_methods */
13737 0, /* tp_members */
13738 0, /* tp_getset */
13739 &PyBaseObject_Type, /* tp_base */
13740 0, /* tp_dict */
13741 0, /* tp_descr_get */
13742 0, /* tp_descr_set */
13743 0, /* tp_dictoffset */
13744 0, /* tp_init */
13745 0, /* tp_alloc */
13746 unicode_new, /* tp_new */
13747 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013748};
13749
13750/* Initialize the Unicode implementation */
13751
Victor Stinner3a50e702011-10-18 21:21:00 +020013752int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013753{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013754 int i;
13755
Thomas Wouters477c8d52006-05-27 19:21:47 +000013756 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013757 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000013758 0x000A, /* LINE FEED */
13759 0x000D, /* CARRIAGE RETURN */
13760 0x001C, /* FILE SEPARATOR */
13761 0x001D, /* GROUP SEPARATOR */
13762 0x001E, /* RECORD SEPARATOR */
13763 0x0085, /* NEXT LINE */
13764 0x2028, /* LINE SEPARATOR */
13765 0x2029, /* PARAGRAPH SEPARATOR */
13766 };
13767
Fred Drakee4315f52000-05-09 19:53:39 +000013768 /* Init the implementation */
Victor Stinnera464fc12011-10-02 20:39:30 +020013769 unicode_empty = PyUnicode_New(0, 0);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013770 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013771 Py_FatalError("Can't create empty string");
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010013772 assert(_PyUnicode_CheckConsistency(unicode_empty, 1));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013773
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013774 for (i = 0; i < 256; i++)
Benjamin Peterson29060642009-01-31 22:14:21 +000013775 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +000013776 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013777 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000013778
13779 /* initialize the linebreak bloom filter */
13780 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013781 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020013782 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013783
13784 PyType_Ready(&EncodingMapType);
Victor Stinner3a50e702011-10-18 21:21:00 +020013785
13786#ifdef HAVE_MBCS
13787 winver.dwOSVersionInfoSize = sizeof(winver);
13788 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
13789 PyErr_SetFromWindowsErr(0);
13790 return -1;
13791 }
13792#endif
13793 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013794}
13795
13796/* Finalize the Unicode implementation */
13797
/* Release cached free-list entries.  This implementation keeps no free
   list, so nothing is released and the reported count is always 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int released = 0;

    return released;
}
13803
Guido van Rossumd57fd912000-03-10 22:53:23 +000013804void
Thomas Wouters78890102000-07-22 19:25:51 +000013805_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013806{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013807 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013808
Guido van Rossum4ae8ef82000-10-03 18:09:04 +000013809 Py_XDECREF(unicode_empty);
13810 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +000013811
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013812 for (i = 0; i < 256; i++) {
Benjamin Peterson29060642009-01-31 22:14:21 +000013813 if (unicode_latin1[i]) {
13814 Py_DECREF(unicode_latin1[i]);
13815 unicode_latin1[i] = NULL;
13816 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000013817 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020013818 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000013819 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013820}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000013821
Walter Dörwald16807132007-05-25 13:52:07 +000013822void
13823PyUnicode_InternInPlace(PyObject **p)
13824{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013825 register PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013826 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020013827#ifdef Py_DEBUG
13828 assert(s != NULL);
13829 assert(_PyUnicode_CHECK(s));
13830#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000013831 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020013832 return;
13833#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000013834 /* If it's a subclass, we don't really know what putting
13835 it in the interned dict might do. */
13836 if (!PyUnicode_CheckExact(s))
13837 return;
13838 if (PyUnicode_CHECK_INTERNED(s))
13839 return;
13840 if (interned == NULL) {
13841 interned = PyDict_New();
13842 if (interned == NULL) {
13843 PyErr_Clear(); /* Don't leave an exception */
13844 return;
13845 }
13846 }
13847 /* It might be that the GetItem call fails even
13848 though the key is present in the dictionary,
13849 namely when this happens during a stack overflow. */
13850 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010013851 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013852 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000013853
Benjamin Peterson29060642009-01-31 22:14:21 +000013854 if (t) {
13855 Py_INCREF(t);
13856 Py_DECREF(*p);
13857 *p = t;
13858 return;
13859 }
Walter Dörwald16807132007-05-25 13:52:07 +000013860
Benjamin Peterson14339b62009-01-31 16:36:08 +000013861 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010013862 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013863 PyErr_Clear();
13864 PyThreadState_GET()->recursion_critical = 0;
13865 return;
13866 }
13867 PyThreadState_GET()->recursion_critical = 0;
13868 /* The two references in interned are not counted by refcnt.
13869 The deallocator will take care of this */
13870 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013871 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000013872}
13873
13874void
13875PyUnicode_InternImmortal(PyObject **p)
13876{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013877 PyUnicode_InternInPlace(p);
13878 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020013879 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013880 Py_INCREF(*p);
13881 }
Walter Dörwald16807132007-05-25 13:52:07 +000013882}
13883
13884PyObject *
13885PyUnicode_InternFromString(const char *cp)
13886{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013887 PyObject *s = PyUnicode_FromString(cp);
13888 if (s == NULL)
13889 return NULL;
13890 PyUnicode_InternInPlace(&s);
13891 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000013892}
13893
Alexander Belopolsky40018472011-02-26 01:02:56 +000013894void
13895_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000013896{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013897 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013898 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013899 Py_ssize_t i, n;
13900 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000013901
Benjamin Peterson14339b62009-01-31 16:36:08 +000013902 if (interned == NULL || !PyDict_Check(interned))
13903 return;
13904 keys = PyDict_Keys(interned);
13905 if (keys == NULL || !PyList_Check(keys)) {
13906 PyErr_Clear();
13907 return;
13908 }
Walter Dörwald16807132007-05-25 13:52:07 +000013909
Benjamin Peterson14339b62009-01-31 16:36:08 +000013910 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13911 detector, interned unicode strings are not forcibly deallocated;
13912 rather, we give them their stolen references back, and then clear
13913 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000013914
Benjamin Peterson14339b62009-01-31 16:36:08 +000013915 n = PyList_GET_SIZE(keys);
13916 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000013917 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013918 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013919 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013920 if (PyUnicode_READY(s) == -1) {
13921 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013922 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020013923 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013924 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013925 case SSTATE_NOT_INTERNED:
13926 /* XXX Shouldn't happen */
13927 break;
13928 case SSTATE_INTERNED_IMMORTAL:
13929 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013930 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013931 break;
13932 case SSTATE_INTERNED_MORTAL:
13933 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013934 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013935 break;
13936 default:
13937 Py_FatalError("Inconsistent interned string state.");
13938 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013939 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013940 }
13941 fprintf(stderr, "total size of all interned strings: "
13942 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13943 "mortal/immortal\n", mortal_size, immortal_size);
13944 Py_DECREF(keys);
13945 PyDict_Clear(interned);
13946 Py_DECREF(interned);
13947 interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +000013948}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013949
13950
13951/********************* Unicode Iterator **************************/
13952
13953typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013954 PyObject_HEAD
13955 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013956 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013957} unicodeiterobject;
13958
13959static void
13960unicodeiter_dealloc(unicodeiterobject *it)
13961{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013962 _PyObject_GC_UNTRACK(it);
13963 Py_XDECREF(it->it_seq);
13964 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013965}
13966
13967static int
13968unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
13969{
Benjamin Peterson14339b62009-01-31 16:36:08 +000013970 Py_VISIT(it->it_seq);
13971 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013972}
13973
13974static PyObject *
13975unicodeiter_next(unicodeiterobject *it)
13976{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013977 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013978
Benjamin Peterson14339b62009-01-31 16:36:08 +000013979 assert(it != NULL);
13980 seq = it->it_seq;
13981 if (seq == NULL)
13982 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020013983 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013984
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013985 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
13986 int kind = PyUnicode_KIND(seq);
13987 void *data = PyUnicode_DATA(seq);
13988 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
13989 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000013990 if (item != NULL)
13991 ++it->it_index;
13992 return item;
13993 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013994
Benjamin Peterson14339b62009-01-31 16:36:08 +000013995 Py_DECREF(seq);
13996 it->it_seq = NULL;
13997 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000013998}
13999
14000static PyObject *
14001unicodeiter_len(unicodeiterobject *it)
14002{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014003 Py_ssize_t len = 0;
14004 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020014005 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014006 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014007}
14008
14009PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14010
14011static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014012 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000014013 length_hint_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000014014 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014015};
14016
14017PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014018 PyVarObject_HEAD_INIT(&PyType_Type, 0)
14019 "str_iterator", /* tp_name */
14020 sizeof(unicodeiterobject), /* tp_basicsize */
14021 0, /* tp_itemsize */
14022 /* methods */
14023 (destructor)unicodeiter_dealloc, /* tp_dealloc */
14024 0, /* tp_print */
14025 0, /* tp_getattr */
14026 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014027 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014028 0, /* tp_repr */
14029 0, /* tp_as_number */
14030 0, /* tp_as_sequence */
14031 0, /* tp_as_mapping */
14032 0, /* tp_hash */
14033 0, /* tp_call */
14034 0, /* tp_str */
14035 PyObject_GenericGetAttr, /* tp_getattro */
14036 0, /* tp_setattro */
14037 0, /* tp_as_buffer */
14038 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14039 0, /* tp_doc */
14040 (traverseproc)unicodeiter_traverse, /* tp_traverse */
14041 0, /* tp_clear */
14042 0, /* tp_richcompare */
14043 0, /* tp_weaklistoffset */
14044 PyObject_SelfIter, /* tp_iter */
14045 (iternextfunc)unicodeiter_next, /* tp_iternext */
14046 unicodeiter_methods, /* tp_methods */
14047 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014048};
14049
14050static PyObject *
14051unicode_iter(PyObject *seq)
14052{
Benjamin Peterson14339b62009-01-31 16:36:08 +000014053 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014054
Benjamin Peterson14339b62009-01-31 16:36:08 +000014055 if (!PyUnicode_Check(seq)) {
14056 PyErr_BadInternalCall();
14057 return NULL;
14058 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014059 if (PyUnicode_READY(seq) == -1)
14060 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014061 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14062 if (it == NULL)
14063 return NULL;
14064 it->it_index = 0;
14065 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014066 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014067 _PyObject_GC_TRACK(it);
14068 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014069}
14070
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010014071
/* Return the number of Py_UNICODE elements in u that precede the
   terminating null element.  u must be null-terminated. */
size_t
Py_UNICODE_strlen(const Py_UNICODE *u)
{
    /* Count with the return type itself: an int counter overflows
       (undefined behavior) for strings longer than INT_MAX elements. */
    size_t res = 0;

    while (*u++)
        res++;
    return res;
}
14080
/* Copy the null-terminated string s2, terminator included, into s1.
   Returns s1.  The caller must provide a large enough destination. */
Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *dst = s1;

    for (;;) {
        *dst = *s2++;
        if (*dst++ == 0)
            break;
    }
    return s1;
}
14088
/* Copy at most n elements of the null-terminated string s2 into s1 and
   return s1.

   Copying stops right after the terminating null element has been
   written.  If s2 holds n or more elements before its terminator, exactly
   n elements are copied and s1 is NOT null-terminated.  Unlike the C
   library strncpy(), the rest of s1 is not padded with null elements. */
Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    Py_UNICODE *u = s1;

    /* Check the budget *before* each store; testing it only after the
       store writes up to n + 1 elements and overruns a destination of
       exactly n elements. */
    while (n-- != 0) {
        if ((*u++ = *s2++) == 0)
            break;
    }
    return s1;
}
14098
/* Append the null-terminated string s2, terminator included, to the end
   of the null-terminated string s1.  Returns s1.  The caller must provide
   enough room in s1. */
Py_UNICODE*
Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *tail = s1;

    /* Locate the terminator of s1 ... */
    while (*tail != 0)
        tail++;
    /* ... and copy s2 over it, up to and including its own terminator. */
    while ((*tail++ = *s2++) != 0)
        ;
    return s1;
}
14107
/* Compare two null-terminated strings element by element.

   Returns 0 if they are equal, -1 if s1 sorts before s2 and 1 if it sorts
   after.  A string that is a proper prefix of the other sorts first. */
int
Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE c1, c2;

    /* Advance to the first position where the strings differ or where
       one of them ends. */
    for (;;) {
        c1 = *s1++;
        c2 = *s2++;
        if (c1 == 0 || c2 == 0 || c1 != c2)
            break;
    }

    if (c1 != 0 && c2 != 0)
        return (c1 < c2) ? -1 : +1;
    if (c1 != 0)
        return 1;       /* s2 ended first */
    if (c2 != 0)
        return -1;      /* s1 ended first */
    return 0;
}
14121
/* Compare at most n elements of two null-terminated strings.

   Returns -1 or 1 at the first differing element, and 0 if the strings
   agree over the first n elements or up to a common terminator. */
int
Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    size_t i;

    for (i = 0; i < n; i++) {
        const Py_UNICODE a = s1[i];
        const Py_UNICODE b = s2[i];

        if (a != b)
            return (a < b) ? -1 : +1;
        if (a == '\0')
            break;      /* both strings end here */
    }
    return 0;
}
14138
/* Find the first occurrence of c in the null-terminated string s.

   Returns a pointer to the matching element, or NULL if there is none.
   The terminator itself is never matched, so searching for 0 yields
   NULL. */
Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
{
    while (*s != 0) {
        if (*s == c)
            return (Py_UNICODE*)s;
        s++;
    }
    return NULL;
}
14148
/* Find the last occurrence of c in the null-terminated string s.

   Returns a pointer to the matching element, or NULL if there is none.
   The terminator itself is never matched, so searching for 0 yields
   NULL. */
Py_UNICODE*
Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
{
    const Py_UNICODE *last = NULL;

    /* Single forward pass that remembers the most recent match. */
    for (; *s != 0; s++) {
        if (*s == c)
            last = s;
    }
    return (Py_UNICODE*)last;
}
Victor Stinner331ea922010-08-10 16:37:20 +000014161
Victor Stinner71133ff2010-09-01 23:43:53 +000014162Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014163PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000014164{
Victor Stinner577db2c2011-10-11 22:12:48 +020014165 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014166 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000014167
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014168 if (!PyUnicode_Check(unicode)) {
14169 PyErr_BadArgument();
14170 return NULL;
14171 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014172 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020014173 if (u == NULL)
14174 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000014175 /* Ensure we won't overflow the size. */
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014176 if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000014177 PyErr_NoMemory();
14178 return NULL;
14179 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020014180 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000014181 size *= sizeof(Py_UNICODE);
14182 copy = PyMem_Malloc(size);
14183 if (copy == NULL) {
14184 PyErr_NoMemory();
14185 return NULL;
14186 }
Victor Stinner577db2c2011-10-11 22:12:48 +020014187 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000014188 return copy;
14189}
Martin v. Löwis5b222132007-06-10 09:51:05 +000014190
Georg Brandl66c221e2010-10-14 07:04:07 +000014191/* A _string module, to export formatter_parser and formatter_field_name_split
14192 to the string.Formatter class implemented in Python. */
14193
14194static PyMethodDef _string_methods[] = {
14195 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
14196 METH_O, PyDoc_STR("split the argument as a field name")},
14197 {"formatter_parser", (PyCFunction) formatter_parser,
14198 METH_O, PyDoc_STR("parse the argument as a format string")},
14199 {NULL, NULL}
14200};
14201
14202static struct PyModuleDef _string_module = {
14203 PyModuleDef_HEAD_INIT,
14204 "_string",
14205 PyDoc_STR("string helper module"),
14206 0,
14207 _string_methods,
14208 NULL,
14209 NULL,
14210 NULL,
14211 NULL
14212};
14213
14214PyMODINIT_FUNC
14215PyInit__string(void)
14216{
14217 return PyModule_Create(&_string_module);
14218}
14219
14220
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000014221#ifdef __cplusplus
14222}
14223#endif